Zulip Chat Archive

Stream: mathlib4

Topic: Lengths of theorems

Kenny Lau (Jul 10 2025 at 23:49):

I wrote a script to count the number of lines of theorems and defs in Mathlib.

The current longest theorem is 213 lines in engel_isBot_of_isMin in Mathlib/Algebra/Lie/CartanExists.lean#L130.

Kenny Lau (Jul 10 2025 at 23:49):

Python script:

# Put me in a folder in mathlib4
# Add that folder to .git/info/exclude

import os
import re
import sys

freq = dict()
longestTen = []
warn = open("line-count-warnings.txt", "w")

def record(filename, start, count):
    global freq, longestTen
    if count in freq:
        freq[count] += 1
    else:
        freq[count] = 1
    if not longestTen or longestTen[0][0] < count:
        longestTen.append((count,filename,start))
        longestTen.sort(key=lambda t:t[0])
        longestTen = longestTen[-10:]

def process(filename, file):
    global warn
    i = 0
    lineCount = 0
    start = 0
    state = 0
    for line in file.split("\n"):
        i += 1
        if line[2:] == "--":
            pass
        elif state == 0: # Outside
            if re.search(":=( (by|calc))?$", line):
                state = 1
                lineCount = 0
                start = i
        elif state == 1: # Inside theorem/def
            if line[:2] == "  ":
                lineCount += 1
            else: # theorem/def ended
                record(filename, start, lineCount)
                lineCount = 0
                if line != "":
                    warn.write("%d %s\n"%(i,filename)) # Warning: theorem/def ended but not newline
                state = 0


fileCount = 0
for root, dirs, files in os.walk('../Mathlib'):
    for file in files:
        filename = os.path.join(root, file)
        with open(filename, "r", encoding="UTF-8") as auto:
            fileCount += 1
            if fileCount % 100 == 0:
                sys.stdout.write("\r%d files processed."%fileCount)
                sys.stdout.flush()
            process(filename, auto.read())

warn.close()

print("\r%d files processed." % fileCount)
print("%d theorems/defs processed." % sum(freq[i] for i in freq))
print()

keys = freq.keys()
maxKeys = max(keys) # What if `keys` is empty?
log10 = len(str(maxKeys))

for i in range(maxKeys+1):
    print("%s: %d"%(str(i).rjust(log10," "), freq.get(i,0)))
print()

print("Longest ten theorems/defs:")
for t in longestTen:
    print("%d lines: Line %d in %s"%(t[0], t[2], t[1]))

Kenny Lau (Jul 10 2025 at 23:50):

Output:

6607 files processed.
166886 theorems/defs processed.

  0: 187
  1: 96891
  2: 20776
  3: 12312
  4: 7433
  5: 5690
  6: 3957
  7: 3273
  8: 2466
  9: 1881
 10: 1636
 11: 1352
 12: 1103
 13: 936
 14: 787
 15: 669
 16: 571
 17: 546
 18: 454
 19: 364
 20: 319
 21: 306
 22: 252
 23: 223
 24: 206
 25: 183
 26: 169
 27: 148
 28: 152
 29: 134
 30: 126
 31: 100
 32: 86
 33: 76
 34: 56
 35: 63
 36: 65
 37: 52
 38: 38
 39: 58
 40: 50
 41: 40
 42: 39
 43: 45
 44: 35
 45: 25
 46: 30
 47: 34
 48: 34
 49: 18
 50: 18
 51: 20
 52: 16
 53: 14
 54: 9
 55: 11
 56: 12
 57: 7
 58: 10
 59: 18
 60: 9
 61: 12
 62: 12
 63: 10
 64: 12
 65: 12
 66: 8
 67: 14
 68: 9
 69: 13
 70: 8
 71: 13
 72: 5
 73: 4
 74: 10
 75: 5
 76: 4
 77: 5
 78: 3
 79: 3
 80: 5
 81: 6
 82: 5
 83: 6
 84: 3
 85: 3
 86: 4
 87: 6
 88: 3
 89: 4
 90: 3
 91: 3
 92: 5
 93: 2
 94: 3
 95: 3
 96: 1
 97: 1
 98: 3
 99: 3
100: 6
101: 6
102: 1
103: 4
104: 0
105: 0
106: 1
107: 3
108: 2
109: 0
110: 2
111: 0
112: 0
113: 2
114: 1
115: 1
116: 1
117: 0
118: 1
119: 0
120: 2
121: 2
122: 1
123: 0
124: 1
125: 2
126: 0
127: 4
128: 1
129: 1
130: 0
131: 0
132: 0
133: 2
134: 0
135: 1
136: 0
137: 0
138: 1
139: 0
140: 0
141: 1
142: 2
143: 0
144: 0
145: 0
146: 0
147: 0
148: 2
149: 1
150: 1
151: 1
152: 0
153: 0
154: 1
155: 0
156: 0
157: 1
158: 0
159: 1
160: 0
161: 0
162: 0
163: 0
164: 0
165: 0
166: 0
167: 0
168: 0
169: 0
170: 0
171: 0
172: 0
173: 0
174: 0
175: 0
176: 0
177: 0
178: 0
179: 0
180: 0
181: 1
182: 0
183: 0
184: 0
185: 0
186: 0
187: 0
188: 0
189: 0
190: 0
191: 0
192: 0
193: 0
194: 0
195: 0
196: 0
197: 0
198: 0
199: 0
200: 0
201: 0
202: 0
203: 0
204: 0
205: 0
206: 0
207: 0
208: 0
209: 0
210: 0
211: 0
212: 0
213: 1

Kenny Lau (Jul 10 2025 at 23:50):

Longest ten theorems/defs:
148 lines: Line 250 in ../Mathlib\MeasureTheory\Covering\Vitali.lean
148 lines: Line 673 in ../Mathlib\SetTheory\Game\Basic.lean
149 lines: Line 174 in ../Mathlib\MeasureTheory\Integral\RieszMarkovKakutani\Real.lean
150 lines: Line 30 in ../Mathlib\Analysis\SpecificLimits\FloorPow.lean
151 lines: Line 745 in ../Mathlib\Topology\MetricSpace\GromovHausdorff.lean
154 lines: Line 157 in ../Mathlib\CategoryTheory\Limits\FilteredColimitCommutesFiniteLimit.lean
157 lines: Line 435 in ../Mathlib\Algebra\Lie\Weights\RootSystem.lean
159 lines: Line 859 in ../Mathlib\MeasureTheory\Covering\Besicovitch.lean
181 lines: Line 196 in ../Mathlib\MeasureTheory\Integral\Layercake.lean
213 lines: Line 130 in ../Mathlib\Algebra\Lie\CartanExists.lean

Kenny Lau (Jul 10 2025 at 23:52):

cc @Bhavik Mehta

Kenny Lau (Jul 10 2025 at 23:53):

so 58.1% of the theorems are one-liners

Kenny Lau (Jul 10 2025 at 23:57):

N = 166886
mean = 3.29553
stdev = 6.13141

N = sum(freq[i] for i in keys)
E = sum(i*freq[i] for i in keys)/N
E2 = sum(i*i*freq[i] for i in keys)/N
s = (E2-E*E)**.5
print("N = %d"%N)
print("mean = %.5f"%E)
print("stdev = %.5f"%s)

Bhavik Mehta (Jul 11 2025 at 00:04):

The frequency data, on a log scale, and the sheet to compute it

Damiano Testa (Jul 11 2025 at 02:13):

I have already seen this list, but I can't find the topic...

Damiano Testa (Jul 11 2025 at 03:05):

I had not posted it maybe, but this is what I had:

import Mathlib

open Lean Elab Command

/--
info: (424, 1) #[Lean.Language.Lean.process]
(354, 1) #[Lean.Compiler.LCNF.ToLCNF.toLCNF]
(311, 1) #[Mathlib.Linter.DirectoryDependency.forbiddenImportDirs]
(273, 1) #[Lean.Elab.Tactic.BVDecide.Frontend.ReifiedBVExpr.of]
(271, 1) #[Lean.Parser.Tactic.grind]
(266, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.insertUnitInvariant_insertUnit]
(241, 2) #[_aux_Init_Data_UInt_Lemmas___macroRules_commandDeclare_uint_theorems___1, commandDeclare_uint_theorems__]
(240, 1) #[Lean.Tactic.FunInd.deriveInductionStructural]
(227, 1) #[Lean.Tactic.FunInd.deriveInductionStructural.doRealize]
(222, 2) #[LieAlgebra.engel_isBot_of_isMin, _private.Lean.Meta.Match.MatchEqs.0.Lean.Meta.Match.mkSplitterProof]
(219, 2) #[Lean.Expr, Lean.Expr.data]
(215, 1) #[Lean.Elab.Tactic.Omega.asLinearComboImpl]
(214, 1) #[ToAdditive.to_additive]
(208, 1) #[Std.Tactic.BVDecide.BVExpr.bitblast.go_denote_eq]
(207, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.confirmRupHint_preserves_invariant_helper]
(206, 1) #[Lean.Elab.Term.CollectPatternVars.collect]
(201, 1) #[Lean.Elab.Tactic.ElimApp.evalAlts]
(199, 1) #[Lean.Meta.Simp.simpHaveTelescope]
(198, 1) #[Vitali.exists_disjoint_covering_ae]
(196, 1) #[MeasureTheory.lintegral_comp_eq_lintegral_meas_le_mul_of_measurable]
(194, 1) #[Lean.Language.Lean.process.parseCmd]
(193, 1) #[Lean.Server.Test.Runner.main]
(192, 1) #[Mathlib.Tactic.Monoidal.instMkEvalHorizontalCompMonoidalM]
(189, 1) #[Lean.Meta.mkRichHCongr]
(188, 2) #[Std.Tactic.BVDecide.BVExpr.bitblast, _private.Lean.Elab.Quotation.0.Lean.Elab.Term.Quotation.getHeadInfo]
(176, 2) #[Mathlib.Notation3._aux_Mathlib_Util_Notation3___elabRules_Mathlib_Notation3_notation3_1, Mathlib.Notation3.notation3]
(174, 1) #[Std.Tactic.BVDecide.BVExpr.bitblast.go]
(171, 1) #[Lean.Meta.MatcherApp.transform]
(169, 1) #[Lean.Elab.Term.elabMutualDef]
(167, 2) #[Besicovitch.exists_closedBall_covering_tsum_measure_le, Lean.Meta.IndPredBelow.mkBelowMatcher]
(166, 1) #[Lean.Meta.Hint.readableDiff]
(164, 1) #[Lean.resolveLocalName]
(162, 2) #[Lean.IR.ToIR.lowerLet, Mathlib.Tactic.CC.CCM.addEqvStep]
(161, 2) #[Lean.Elab.Tactic.BVDecide.Frontend.ReifiedBVExpr.of.go, LieAlgebra.IsKilling.eq_top_of_invtSubmodule_ne_bot]
(160, 3) #[GromovHausdorff.totallyBounded, Lean.Elab.Term.toParserDescr, Lean.Meta.Grind.Arith.Linear.getStructId?]
(159, 1) #[CategoryTheory.Limits.colimitLimitToLimitColimit_surjective]
(157, 1) #[tendsto_div_of_monotone_of_exists_subseq_tendsto_div]
(156, 1) #[Mathlib.Tactic.Monoidal.instMonadMor₂MonoidalM]
(154, 2) #[Lean.Tactic.FunInd.buildInductionBody, Lean.Tactic.FunInd.foldAndCollect]
(153, 1) #[Mathlib.Tactic.Bicategory.instMonadMor₂BicategoryM]
(152, 3) #[Lean.Elab.PartialFixpoint.deriveInduction, Mathlib.Tactic.TermCongr.mkCongrOf, _private.Mathlib.MeasureTheory.Integral.RieszMarkovKakutani.Real.0.RealRMK.integral_riesz_aux]
(151, 3) #[Aesop.addRappUnsafe, Std.Time.Modifier, _private.Mathlib.RingTheory.Spectrum.Prime.ChevalleyComplexity.0.ChevalleyThm.PolynomialC.induction_aux]
(150, 3) #[Lean.Elab.Tactic.BVDecide.Frontend.Normalize.isSupportedMatch, Lean.Meta.Grind.Arith.Linear.getStructId?.go?, Lean.Meta.addPPExplicitToExposeDiff]
(149, 2) #[Lean.Meta.ACLt.main, SetTheory.PGame.quot_mul_assoc]
(148, 1) #[MeasureTheory.measurableSet_range_of_continuous_injective]
(147, 4)
(146, 1) #[AkraBazziRecurrence.smoothingFn_mul_asympBound_isBigO_T]
(145, 1) #[Lean.Compiler.LCNF.Simp.simpJpCases?]
(144, 2) #[Lean.Elab.partialFixpoint, VitaliFamily.exists_measurable_supersets_limRatio]
(143, 3) #[Lean.PrettyPrinter.Delaborator.TopDownAnalyze.analyzeAppStagedCore, Lean.Server.FileWorker.locationLinksOfInfo, eVariationOn.add_point]
(142, 1) #[Besicovitch.exists_disjoint_closedBall_covering_ae_of_finiteMeasure_aux]
(140, 2) #[AlgebraicGeometry.Proj.valuativeCriterion_existence_aux, Mathlib.Tactic.ITauto.applyProof]
(138, 2) #[GromovHausdorff.hausdorffDist_optimal, MeasureTheory.hasFDerivAt_convolution_right_with_param]
(137, 1) #[exists_closed_cover_approximatesLinearOn_of_hasFDerivWithinAt]
(136, 3) #[ApproximatesLinearOn.surjOn_closedBall_of_nonlinearRightInverse, FDerivMeasurableAux.D_subset_differentiable_set, Lean.Meta.Simp.Config]
(135, 4)
(134, 1) #[MultilinearMap.map_sum_finset_aux]
(132, 4)
(131, 2) #[Lean.Compiler.LCNF.Simp.simp, Submodule.basis_of_pid_aux]
(130, 2) #[ContMDiffWithinAt.mfderivWithin, Mathlib.Tactic.Bicategory.instMkEvalWhiskerRightBicategoryM]
(129, 3) #[Aesop.Options, Besicovitch.exist_finset_disjoint_balls_large_measure, Mathlib.Tactic.CC.CCM.addEqvStep.go]
(128, 3) #[AddCircle.addWellApproximable_ae_empty_or_univ, MeasureTheory.hahn_decomposition, _private.Lean.Elab.Inductive.0.Lean.Elab.Command.elabCtors]
(127, 2) #[Mathlib.Tactic.Bicategory.instMonadNormalizeNaturalityBicategoryM, _private.Lean.Data.FuzzyMatching.0.Lean.FuzzyMatching.fuzzyMatchCore]
(126, 6)
(125, 2) #[EMetric.instParacompactSpace, Lean.Meta.Simp.simpHaveTelescope.simpHaveTelescopeAux]
(124, 1) #[_private.Mathlib.RingTheory.Spectrum.Prime.ChevalleyComplexity.0.ChevalleyThm.PolynomialC.statement]
(123, 1) #[Std.Tactic.BVDecide.LRAT.Internal.DefaultFormula.assignmentsInvariant_insertRupUnits_of_assignmentsInvariant]
(122, 5)
(121, 2) #[Lean.Elab.Tactic.BVDecide.Frontend.Normalize.intToBitVecPass, Lean.Parser.Command.variable]
(120, 2) #[MeasureTheory.eLpNorm_le_eLpNorm_fderiv_of_eq_inner, _private.Lean.Server.FileWorker.0.Lean.Server.FileWorker.reportSnapshots]
(119, 6)
(118, 2) #[Besicovitch.TauPackage.color_lt, _private.Std.Time.Format.Basic.0.Std.Time.formatWith]
(117, 4)
(116, 4)
(115, 4)
(114, 3) #[Lean.Meta.Simp.tryAutoCongrTheorem?, Lean.Parser.Command.open, _private.Mathlib.Combinatorics.SimpleGraph.Tutte.0.SimpleGraph.tutte_exists_isPerfectMatching_of_near_matchings]
(113, 11)
(112, 4)
(111, 4)
(110, 3) #[Lean.Elab.Term.elabLetDeclAux, Lean.Tactic.FunInd.deriveUnaryInduction, _private.Std.Time.Format.Basic.0.Std.Time.parseWith]
(109, 4)
(108, 5)
(107, 2) #[Lean.Meta.unifyEq?, MeasureTheory.GridLines.T_insert_le_T_lmarginal_singleton]
(106, 7)
(105, 4)
(104, 10)
(103, 6)
(102, 4)
(101, 6)
(100, 5)
(99, 7)
(98, 6)
(97, 4)
(96, 11)
(95, 8)
(94, 6)
(93, 4)
(92, 8)
(91, 7)
(90, 10)
(89, 8)
(88, 17)
(87, 4)
(86, 9)
(85, 10)
(84, 8)
(83, 14)
(82, 12)
(81, 11)
(80, 13)
(79, 14)
(78, 15)
(77, 15)
(76, 22)
(75, 23)
(74, 14)
(73, 20)
(72, 26)
(71, 24)
(70, 27)
(69, 22)
(68, 20)
(67, 25)
(66, 21)
(65, 36)
(64, 27)
(63, 26)
(62, 33)
(61, 27)
(60, 31)
(59, 34)
(58, 39)
(57, 34)
(56, 29)
(55, 44)
(54, 40)
(53, 43)
(52, 51)
(51, 51)
(50, 62)
(49, 65)
(48, 87)
(47, 71)
(46, 84)
(45, 103)
(44, 79)
(43, 97)
(42, 102)
(41, 114)
(40, 124)
(39, 148)
(38, 149)
(37, 157)
(36, 183)
(35, 193)
(34, 207)
(33, 242)
(32, 275)
(31, 294)
(30, 280)
(29, 352)
(28, 369)
(27, 428)
(26, 428)
(25, 502)
(24, 576)
(23, 662)
(22, 716)
(21, 820)
(20, 955)
(19, 1101)
(18, 1270)
(17, 1537)
(16, 1724)
(15, 2030)
(14, 2362)
(13, 2964)
(12, 3516)
(11, 4406)
(10, 5364)
(9, 6915)
(8, 9206)
(7, 12304)
(6, 16900)
(5, 24863)
(4, 41164)
(3, 65002)
(2, 56790)
(1, 76074)
-/
#guard_msgs in
run_cmd
  let env ← getEnv
  let csts := env.constants.map₁
  let mut lengthToNames : Std.HashMap Nat NameSet := ∅
  for (nm, _) in csts do
    if let some rg := declRangeExt.find? env nm then
      lengthToNames := lengthToNames.alter (rg.range.endPos.line - rg.range.pos.line + 1) (·.getD ∅ |>.insert nm)
  for (l, nms) in lengthToNames.toArray.qsort (·.1 > ·.1) do
    dbg_trace "{(l, nms.size)}{if nms.size ≤ 3 then s!" {nms.toArray.qsort (·.toString < ·.toString)}" else ""}"

Damiano Testa (Jul 11 2025 at 03:06):

It shows the number of lines, followed by how many declarations of that length there are. When there are at most 3 declarations, it also prints their names.

Kenny Lau (Jul 11 2025 at 03:12):

@Damiano Testa could you share its output?

Damiano Testa (Jul 11 2025 at 03:13):

Sure, I updated the message above!

Damiano Testa (Jul 11 2025 at 03:13):

Note that the list above includes all declarations that are actually written somewhere and also includes the docstrings.

Damiano Testa (Jul 11 2025 at 03:16):

Ah, now with the full name, I found the message!

import Mathlib

open Lean Elab Command

structure Lengths where
  repos : Array Name
  lengths : Array (Std.HashMap Nat (Array Name)) := List.replicate repos.size ∅ |>.toArray

def Lengths.add (ls : Lengths) (repo : Name) (l : Nat) (decl : Name) : Lengths :=
  if let some idx := ls.repos.findIdx? (· == repo) then
    {ls with lengths := ls.lengths.modify idx (·.alter l (·.getD #[] |>.push decl))}
  else
    ls

def Data_toMessageData (repo : Name) (dat : Array (Nat × (Array Name))) (threshold : Nat) : MessageData :=
  let tot := dat.foldl (init := 0) fun t (_, names) => t + names.size
  let min := dat[0]!.1
  let header := m!"{tot} decls, {dat.size} lengths between {min}-{dat.back!.1}"
  .trace {cls := repo} header <| #[m!"Length: No of decls"] ++ (dat.map fun (lth, nms) => m!"\nlength {lth}: decls {nms.size}" ++
    if nms.size ≤ threshold then
      .trace {cls := `show_me_the_names!} m!"" #[.joinSep (m!""::nms.toList.map (fun n => m!"{.ofConstName n}")) " "]
    else m!"")

run_cmd
  let threshold := 3
  let env ← getEnv
  let mods := env.allImportedModuleNames
  let repos := mods.foldl (init := (∅ : NameSet)) (·.insert ·.getRoot) |>.toArray.qsort (·.toString < ·.toString)
  let mut ls : Lengths := {repos := repos}
  let reposToDat ← env.constants.map₁.foldM (init := ls) fun reposToDat n _ => do
    if let some rgs ← findDeclarationRanges? n then
      let mod := env.getModuleIdxFor? n |>.getD default
      let modName := mods[mod]!
      let range := rgs.range
      let length := range.endPos.line - range.pos.line + 1
      return reposToDat.add modName.getRoot length n
    else return reposToDat
  let mut msg := #[]
  let sorted := reposToDat.lengths.map (·.toArray.qsort (·.1 < ·.1))
  for (repo, dat) in repos.zip sorted do
      msg := msg.push <| Data_toMessageData repo dat threshold
  logInfo <| .joinSep msg.toList "\n"

Damiano Testa (Jul 11 2025 at 03:18):

This one is broken down by dependency and is clickable (you can see it in interactive form in the lean server),

Yaël Dillies (Jul 11 2025 at 08:11):

Bhavik Mehta said:

The frequency data, on a log scale, and the sheet to compute it

Looks pretty zipfy !

Last updated: Feb 28 2026 at 14:05 UTC