Zulip Chat Archive
Stream: mathlib4
Topic: Lengths of theorems
Kenny Lau (Jul 10 2025 at 23:49):
I wrote a script to count the number of lines of theorems and defs in Mathlib.
The current longest theorem is 213 lines in engel_isBot_of_isMin in Mathlib/Algebra/Lie/CartanExists.lean#L130.
Kenny Lau (Jul 10 2025 at 23:49):
Python script:
# Put me in a folder in mathlib4
# Add that folder to .git/info/exclude
import os
import re
import sys
freq = dict()
longestTen = []
warn = open("line-count-warnings.txt", "w")
def record(filename, start, count):
global freq, longestTen
if count in freq:
freq[count] += 1
else:
freq[count] = 1
if not longestTen or longestTen[0][0] < count:
longestTen.append((count,filename,start))
longestTen.sort(key=lambda t:t[0])
longestTen = longestTen[-10:]
def process(filename, file):
global warn
i = 0
lineCount = 0
start = 0
state = 0
for line in file.split("\n"):
i += 1
if line[2:] == "--":
pass
elif state == 0: # Outside
if re.search(":=( (by|calc))?$", line):
state = 1
lineCount = 0
start = i
elif state == 1: # Inside theorem/def
if line[:2] == " ":
lineCount += 1
else: # theorem/def ended
record(filename, start, lineCount)
lineCount = 0
if line != "":
warn.write("%d %s\n"%(i,filename)) # Warning: theorem/def ended but not newline
state = 0
fileCount = 0
for root, dirs, files in os.walk('../Mathlib'):
for file in files:
filename = os.path.join(root, file)
with open(filename, "r", encoding="UTF-8") as auto:
fileCount += 1
if fileCount % 100 == 0:
sys.stdout.write("\r%d files processed."%fileCount)
sys.stdout.flush()
process(filename, auto.read())
warn.close()
print("\r%d files processed." % fileCount)
print("%d theorems/defs processed." % sum(freq[i] for i in freq))
print()
keys = freq.keys()
maxKeys = max(keys) # What if `keys` is empty?
log10 = len(str(maxKeys))
for i in range(maxKeys+1):
print("%s: %d"%(str(i).rjust(log10," "), freq.get(i,0)))
print()
print("Longest ten theorems/defs:")
for t in longestTen:
print("%d lines: Line %d in %s"%(t[0], t[2], t[1]))
Kenny Lau (Jul 10 2025 at 23:50):
Output:
6607 files processed.
166886 theorems/defs processed.
0: 187
1: 96891
2: 20776
3: 12312
4: 7433
5: 5690
6: 3957
7: 3273
8: 2466
9: 1881
10: 1636
11: 1352
12: 1103
13: 936
14: 787
15: 669
16: 571
17: 546
18: 454
19: 364
20: 319
21: 306
22: 252
23: 223
24: 206
25: 183
26: 169
27: 148
28: 152
29: 134
30: 126
31: 100
32: 86
33: 76
34: 56
35: 63
36: 65
37: 52
38: 38
39: 58
40: 50
41: 40
42: 39
43: 45
44: 35
45: 25
46: 30
47: 34
48: 34
49: 18
50: 18
51: 20
52: 16
53: 14
54: 9
55: 11
56: 12
57: 7
58: 10
59: 18
60: 9
61: 12
62: 12
63: 10
64: 12
65: 12
66: 8
67: 14
68: 9
69: 13
70: 8
71: 13
72: 5
73: 4
74: 10
75: 5
76: 4
77: 5
78: 3
79: 3
80: 5
81: 6
82: 5
83: 6
84: 3
85: 3
86: 4
87: 6
88: 3
89: 4
90: 3
91: 3
92: 5
93: 2
94: 3
95: 3
96: 1
97: 1
98: 3
99: 3
100: 6
101: 6
102: 1
103: 4
104: 0
105: 0
106: 1
107: 3
108: 2
109: 0
110: 2
111: 0
112: 0
113: 2
114: 1
115: 1
116: 1
117: 0
118: 1
119: 0
120: 2
121: 2
122: 1
123: 0
124: 1
125: 2
126: 0
127: 4
128: 1
129: 1
130: 0
131: 0
132: 0
133: 2
134: 0
135: 1
136: 0
137: 0
138: 1
139: 0
140: 0
141: 1
142: 2
143: 0
144: 0
145: 0
146: 0
147: 0
148: 2
149: 1
150: 1
151: 1
152: 0
153: 0
154: 1
155: 0
156: 0
157: 1
158: 0
159: 1
160: 0
161: 0
162: 0
163: 0
164: 0
165: 0
166: 0
167: 0
168: 0
169: 0
170: 0
171: 0
172: 0
173: 0
174: 0
175: 0
176: 0
177: 0
178: 0
179: 0
180: 0
181: 1
182: 0
183: 0
184: 0
185: 0
186: 0
187: 0
188: 0
189: 0
190: 0
191: 0
192: 0
193: 0
194: 0
195: 0
196: 0
197: 0
198: 0
199: 0
200: 0
201: 0
202: 0
203: 0
204: 0
205: 0
206: 0
207: 0
208: 0
209: 0
210: 0
211: 0
212: 0
213: 1
Kenny Lau (Jul 10 2025 at 23:50):
Longest ten theorems/defs:
148 lines: Line 250 in ../Mathlib\MeasureTheory\Covering\Vitali.lean
148 lines: Line 673 in ../Mathlib\SetTheory\Game\Basic.lean
149 lines: Line 174 in ../Mathlib\MeasureTheory\Integral\RieszMarkovKakutani\Real.lean
150 lines: Line 30 in ../Mathlib\Analysis\SpecificLimits\FloorPow.lean
151 lines: Line 745 in ../Mathlib\Topology\MetricSpace\GromovHausdorff.lean
154 lines: Line 157 in ../Mathlib\CategoryTheory\Limits\FilteredColimitCommutesFiniteLimit.lean
157 lines: Line 435 in ../Mathlib\Algebra\Lie\Weights\RootSystem.lean
159 lines: Line 859 in ../Mathlib\MeasureTheory\Covering\Besicovitch.lean
181 lines: Line 196 in ../Mathlib\MeasureTheory\Integral\Layercake.lean
213 lines: Line 130 in ../Mathlib\Algebra\Lie\CartanExists.lean
Kenny Lau (Jul 10 2025 at 23:52):
cc @Bhavik Mehta
Kenny Lau (Jul 10 2025 at 23:53):
so 58.1% of the theorems are one-liners
Kenny Lau (Jul 10 2025 at 23:57):
N = 166886
mean = 3.29553
stdev = 6.13141
N = sum(freq[i] for i in keys)
E = sum(i*freq[i] for i in keys)/N
E2 = sum(i*i*freq[i] for i in keys)/N
s = (E2-E*E)**.5
print("N = %d"%N)
print("mean = %.5f"%E)
print("stdev = %.5f"%s)
Bhavik Mehta (Jul 11 2025 at 00:04):
The frequency data, on a log scale, and the sheet to compute it
Damiano Testa (Jul 11 2025 at 02:13):
I have already seen this list, but I can't find the topic...
Damiano Testa (Jul 11 2025 at 03:05):
I had not posted it maybe, but this is what I had:
import Mathlib
open Lean Elab Command
/--
info: (424, 1) #[Lean.Language.Lean.process]
(354, 1) #[Lean.Compiler.LCNF.ToLCNF.toLCNF]
(311, 1) #[Mathlib.Linter.DirectoryDependency.forbiddenImportDirs]
(273, 1) #[Lean.Elab.Tactic.BVDecide.Frontend.ReifiedBVExpr.of]
(271, 1) #[Lean.Parser.Tactic.grind]
(266, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.insertUnitInvariant_insertUnit]
(241, 2) #[_aux_Init_Data_UInt_Lemmas___macroRules_commandDeclare_uint_theorems___1, commandDeclare_uint_theorems__]
(240, 1) #[Lean.Tactic.FunInd.deriveInductionStructural]
(227, 1) #[Lean.Tactic.FunInd.deriveInductionStructural.doRealize]
(222, 2) #[LieAlgebra.engel_isBot_of_isMin, _private.Lean.Meta.Match.MatchEqs.0.Lean.Meta.Match.mkSplitterProof]
(219, 2) #[Lean.Expr, Lean.Expr.data]
(215, 1) #[Lean.Elab.Tactic.Omega.asLinearComboImpl]
(214, 1) #[ToAdditive.to_additive]
(208, 1) #[Std.Tactic.BVDecide.BVExpr.bitblast.go_denote_eq]
(207, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.confirmRupHint_preserves_invariant_helper]
(206, 1) #[Lean.Elab.Term.CollectPatternVars.collect]
(201, 1) #[Lean.Elab.Tactic.ElimApp.evalAlts]
(199, 1) #[Lean.Meta.Simp.simpHaveTelescope]
(198, 1) #[Vitali.exists_disjoint_covering_ae]
(196, 1) #[MeasureTheory.lintegral_comp_eq_lintegral_meas_le_mul_of_measurable]
(194, 1) #[Lean.Language.Lean.process.parseCmd]
(193, 1) #[Lean.Server.Test.Runner.main]
(192, 1) #[Mathlib.Tactic.Monoidal.instMkEvalHorizontalCompMonoidalM]
(189, 1) #[Lean.Meta.mkRichHCongr]
(188, 2) #[Std.Tactic.BVDecide.BVExpr.bitblast, _private.Lean.Elab.Quotation.0.Lean.Elab.Term.Quotation.getHeadInfo]
(176, 2) #[Mathlib.Notation3._aux_Mathlib_Util_Notation3___elabRules_Mathlib_Notation3_notation3_1, Mathlib.Notation3.notation3]
(174, 1) #[Std.Tactic.BVDecide.BVExpr.bitblast.go]
(171, 1) #[Lean.Meta.MatcherApp.transform]
(169, 1) #[Lean.Elab.Term.elabMutualDef]
(167, 2) #[Besicovitch.exists_closedBall_covering_tsum_measure_le, Lean.Meta.IndPredBelow.mkBelowMatcher]
(166, 1) #[Lean.Meta.Hint.readableDiff]
(164, 1) #[Lean.resolveLocalName]
(162, 2) #[Lean.IR.ToIR.lowerLet, Mathlib.Tactic.CC.CCM.addEqvStep]
(161, 2) #[Lean.Elab.Tactic.BVDecide.Frontend.ReifiedBVExpr.of.go, LieAlgebra.IsKilling.eq_top_of_invtSubmodule_ne_bot]
(160, 3) #[GromovHausdorff.totallyBounded, Lean.Elab.Term.toParserDescr, Lean.Meta.Grind.Arith.Linear.getStructId?]
(159, 1) #[CategoryTheory.Limits.colimitLimitToLimitColimit_surjective]
(157, 1) #[tendsto_div_of_monotone_of_exists_subseq_tendsto_div]
(156, 1) #[Mathlib.Tactic.Monoidal.instMonadMor₂MonoidalM]
(154, 2) #[Lean.Tactic.FunInd.buildInductionBody, Lean.Tactic.FunInd.foldAndCollect]
(153, 1) #[Mathlib.Tactic.Bicategory.instMonadMor₂BicategoryM]
(152, 3) #[Lean.Elab.PartialFixpoint.deriveInduction, Mathlib.Tactic.TermCongr.mkCongrOf, _private.Mathlib.MeasureTheory.Integral.RieszMarkovKakutani.Real.0.RealRMK.integral_riesz_aux]
(151, 3) #[Aesop.addRappUnsafe, Std.Time.Modifier, _private.Mathlib.RingTheory.Spectrum.Prime.ChevalleyComplexity.0.ChevalleyThm.PolynomialC.induction_aux]
(150, 3) #[Lean.Elab.Tactic.BVDecide.Frontend.Normalize.isSupportedMatch, Lean.Meta.Grind.Arith.Linear.getStructId?.go?, Lean.Meta.addPPExplicitToExposeDiff]
(149, 2) #[Lean.Meta.ACLt.main, SetTheory.PGame.quot_mul_assoc]
(148, 1) #[MeasureTheory.measurableSet_range_of_continuous_injective]
(147, 4)
(146, 1) #[AkraBazziRecurrence.smoothingFn_mul_asympBound_isBigO_T]
(145, 1) #[Lean.Compiler.LCNF.Simp.simpJpCases?]
(144, 2) #[Lean.Elab.partialFixpoint, VitaliFamily.exists_measurable_supersets_limRatio]
(143, 3) #[Lean.PrettyPrinter.Delaborator.TopDownAnalyze.analyzeAppStagedCore, Lean.Server.FileWorker.locationLinksOfInfo, eVariationOn.add_point]
(142, 1) #[Besicovitch.exists_disjoint_closedBall_covering_ae_of_finiteMeasure_aux]
(140, 2) #[AlgebraicGeometry.Proj.valuativeCriterion_existence_aux, Mathlib.Tactic.ITauto.applyProof]
(138, 2) #[GromovHausdorff.hausdorffDist_optimal, MeasureTheory.hasFDerivAt_convolution_right_with_param]
(137, 1) #[exists_closed_cover_approximatesLinearOn_of_hasFDerivWithinAt]
(136, 3) #[ApproximatesLinearOn.surjOn_closedBall_of_nonlinearRightInverse, FDerivMeasurableAux.D_subset_differentiable_set, Lean.Meta.Simp.Config]
(135, 4)
(134, 1) #[MultilinearMap.map_sum_finset_aux]
(132, 4)
(131, 2) #[Lean.Compiler.LCNF.Simp.simp, Submodule.basis_of_pid_aux]
(130, 2) #[ContMDiffWithinAt.mfderivWithin, Mathlib.Tactic.Bicategory.instMkEvalWhiskerRightBicategoryM]
(129, 3) #[Aesop.Options, Besicovitch.exist_finset_disjoint_balls_large_measure, Mathlib.Tactic.CC.CCM.addEqvStep.go]
(128, 3) #[AddCircle.addWellApproximable_ae_empty_or_univ, MeasureTheory.hahn_decomposition, _private.Lean.Elab.Inductive.0.Lean.Elab.Command.elabCtors]
(127, 2) #[Mathlib.Tactic.Bicategory.instMonadNormalizeNaturalityBicategoryM, _private.Lean.Data.FuzzyMatching.0.Lean.FuzzyMatching.fuzzyMatchCore]
(126, 6)
(125, 2) #[EMetric.instParacompactSpace, Lean.Meta.Simp.simpHaveTelescope.simpHaveTelescopeAux]
(124, 1) #[_private.Mathlib.RingTheory.Spectrum.Prime.ChevalleyComplexity.0.ChevalleyThm.PolynomialC.statement]
(123, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.assignmentsInvariant_insertRupUnits_of_assignmentsInvariant]
(122, 5)
(121, 2) #[Lean.Elab.Tactic.BVDecide.Frontend.Normalize.intToBitVecPass, Lean.Parser.Command.variable]
(120, 2) #[MeasureTheory.eLpNorm_le_eLpNorm_fderiv_of_eq_inner, _private.Lean.Server.FileWorker.0.Lean.Server.FileWorker.reportSnapshots]
(119, 6)
(118, 2) #[Besicovitch.TauPackage.color_lt, _private.Std.Time.Format.Basic.0.Std.Time.formatWith]
(117, 4)
(116, 4)
(115, 4)
(114, 3) #[Lean.Meta.Simp.tryAutoCongrTheorem?, Lean.Parser.Command.open, _private.Mathlib.Combinatorics.SimpleGraph.Tutte.0.SimpleGraph.tutte_exists_isPerfectMatching_of_near_matchings]
(113, 11)
(112, 4)
(111, 4)
(110, 3) #[Lean.Elab.Term.elabLetDeclAux, Lean.Tactic.FunInd.deriveUnaryInduction, _private.Std.Time.Format.Basic.0.Std.Time.parseWith]
(109, 4)
(108, 5)
(107, 2) #[Lean.Meta.unifyEq?, MeasureTheory.GridLines.T_insert_le_T_lmarginal_singleton]
(106, 7)
(105, 4)
(104, 10)
(103, 6)
(102, 4)
(101, 6)
(100, 5)
(99, 7)
(98, 6)
(97, 4)
(96, 11)
(95, 8)
(94, 6)
(93, 4)
(92, 8)
(91, 7)
(90, 10)
(89, 8)
(88, 17)
(87, 4)
(86, 9)
(85, 10)
(84, 8)
(83, 14)
(82, 12)
(81, 11)
(80, 13)
(79, 14)
(78, 15)
(77, 15)
(76, 22)
(75, 23)
(74, 14)
(73, 20)
(72, 26)
(71, 24)
(70, 27)
(69, 22)
(68, 20)
(67, 25)
(66, 21)
(65, 36)
(64, 27)
(63, 26)
(62, 33)
(61, 27)
(60, 31)
(59, 34)
(58, 39)
(57, 34)
(56, 29)
(55, 44)
(54, 40)
(53, 43)
(52, 51)
(51, 51)
(50, 62)
(49, 65)
(48, 87)
(47, 71)
(46, 84)
(45, 103)
(44, 79)
(43, 97)
(42, 102)
(41, 114)
(40, 124)
(39, 148)
(38, 149)
(37, 157)
(36, 183)
(35, 193)
(34, 207)
(33, 242)
(32, 275)
(31, 294)
(30, 280)
(29, 352)
(28, 369)
(27, 428)
(26, 428)
(25, 502)
(24, 576)
(23, 662)
(22, 716)
(21, 820)
(20, 955)
(19, 1101)
(18, 1270)
(17, 1537)
(16, 1724)
(15, 2030)
(14, 2362)
(13, 2964)
(12, 3516)
(11, 4406)
(10, 5364)
(9, 6915)
(8, 9206)
(7, 12304)
(6, 16900)
(5, 24863)
(4, 41164)
(3, 65002)
(2, 56790)
(1, 76074)
-/
#guard_msgs in
run_cmd
let env ← getEnv
let csts := env.constants.map₁
let mut lengthToNames : Std.HashMap Nat NameSet := ∅
for (nm, _) in csts do
if let some rg := declRangeExt.find? env nm then
lengthToNames := lengthToNames.alter (rg.range.endPos.line - rg.range.pos.line + 1) (·.getD ∅ |>.insert nm)
for (l, nms) in lengthToNames.toArray.qsort (·.1 > ·.1) do
dbg_trace "{(l, nms.size)}{if nms.size ≤ 3 then s!" {nms.toArray.qsort (·.toString < ·.toString)}" else ""}"
Damiano Testa (Jul 11 2025 at 03:06):
It shows the number of lines, followed by how many declarations of that length there are. When there are at most 3 declarations, it also prints their names.
Kenny Lau (Jul 11 2025 at 03:12):
@Damiano Testa could you share its output?
Damiano Testa (Jul 11 2025 at 03:13):
Sure, I updated the message above!
Damiano Testa (Jul 11 2025 at 03:13):
Note that the list above includes all declarations that are actually written somewhere and also includes the docstrings.
Damiano Testa (Jul 11 2025 at 03:16):
Ah, now with the full name, I found the message!
import Mathlib
open Lean Elab Command
structure Lengths where
repos : Array Name
lengths : Array (Std.HashMap Nat (Array Name)) := List.replicate repos.size ∅ |>.toArray
def Lengths.add (ls : Lengths) (repo : Name) (l : Nat) (decl : Name) : Lengths :=
if let some idx := ls.repos.findIdx? (· == repo) then
{ls with lengths := ls.lengths.modify idx (·.alter l (·.getD #[] |>.push decl))}
else
ls
def Data_toMessageData (repo : Name) (dat : Array (Nat × (Array Name))) (threshold : Nat) : MessageData :=
let tot := dat.foldl (init := 0) fun t (_, names) => t + names.size
let min := dat[0]!.1
let header := m!"{tot} decls, {dat.size} lengths between {min}-{dat.back!.1}"
.trace {cls := repo} header <| #[m!"Length: No of decls"] ++ (dat.map fun (lth, nms) => m!"\nlength {lth}: decls {nms.size}" ++
if nms.size ≤ threshold then
.trace {cls := `show_me_the_names!} m!"" #[.joinSep (m!""::nms.toList.map (fun n => m!"{.ofConstName n}")) " "]
else m!"")
run_cmd
let threshold := 3
let env ← getEnv
let mods := env.allImportedModuleNames
let repos := mods.foldl (init := (∅ : NameSet)) (·.insert ·.getRoot) |>.toArray.qsort (·.toString < ·.toString)
let mut ls : Lengths := {repos := repos}
let reposToDat ← env.constants.map₁.foldM (init := ls) fun reposToDat n _ => do
if let some rgs ← findDeclarationRanges? n then
let mod := env.getModuleIdxFor? n |>.getD default
let modName := mods[mod]!
let range := rgs.range
let length := range.endPos.line - range.pos.line + 1
return reposToDat.add modName.getRoot length n
else return reposToDat
let mut msg := #[]
let sorted := reposToDat.lengths.map (·.toArray.qsort (·.1 < ·.1))
for (repo, dat) in repos.zip sorted do
msg := msg.push <| Data_toMessageData repo dat threshold
logInfo <| .joinSep msg.toList "\n"
Damiano Testa (Jul 11 2025 at 03:18):
This one is broken down by dependency and is clickable (you can see it in interactive form in the lean server),
Yaël Dillies (Jul 11 2025 at 08:11):
Bhavik Mehta said:
The frequency data, on a log scale, and the sheet to compute it
Looks pretty zipfy !
Last updated: Dec 20 2025 at 21:32 UTC