Preliminary developments for strings

This file contains the material about strings which we can write down without the results in Init.Data.String.Decode, i.e., without knowing about the bijection between String and List Char given by UTF-8 decoding and encoding.

Note that this file, despite being called Defs, contains quite a few lemmas.

source

@[simp]

theorem List.utf8Encode_nil :

[].utf8Encode = ByteArray.empty

source

theorem List.utf8Encode_singleton {c : Char} :

[c].utf8Encode = (String.utf8EncodeChar c).toByteArray

source

@[simp]

theorem List.utf8Encode_append {l l' : List Char} :

(l ++ l').utf8Encode = l.utf8Encode ++ l'.utf8Encode

source

theorem List.utf8Encode_cons {c : Char} {l : List Char} :

(c :: l).utf8Encode = [c].utf8Encode ++ l.utf8Encode

source

@[simp]

theorem String.utf8EncodeChar_ne_nil {c : Char} :

utf8EncodeChar c ≠ []

source

@[simp]

theorem List.utf8Encode_eq_empty {l : List Char} :

l.utf8Encode = ByteArray.empty ↔ l = []

source

theorem ByteArray.isValidUTF8_utf8Encode {l : List Char} :

l.utf8Encode.IsValidUTF8

source

@[simp]

theorem ByteArray.isValidUTF8_empty :

empty.IsValidUTF8

source

theorem Char.isValidUTF8_toByteArray_utf8EncodeChar {c : Char} :

(String.utf8EncodeChar c).toByteArray.IsValidUTF8

source

theorem ByteArray.IsValidUTF8.append {b b' : ByteArray} (h : b.IsValidUTF8) (h' : b'.IsValidUTF8) :

(b ++ b').IsValidUTF8

source

@[inline]

def String.fromUTF8 (a : ByteArray) (h : a.IsValidUTF8) :

String

Decodes an array of bytes that encode a string as UTF-8 into the corresponding string.

Equations

String.fromUTF8 a h = { toByteArray := a, isValidUTF8 := h }

Instances For

source

@[extern lean_string_to_utf8]

def String.toUTF8 (a : String) :

ByteArray

Encodes a string in UTF-8 as an array of bytes.

Equations

a.toUTF8 = a.toByteArray

Instances For

source

@[simp]

theorem String.toUTF8_eq_toByteArray {s : String} :

s.toUTF8 = s.toByteArray

source

@[simp]

theorem String.toByteArray_empty :

"".toByteArray = ByteArray.empty

source

@[extern lean_string_append]

def String.append (s : String) (t : String) :

String

Appends two strings. Usually accessed via the ++ operator.

The internal implementation will perform destructive updates if the string is not shared.

Examples:

"abc".append "def" = "abcdef"
"abc" ++ "def" = "abcdef"
"" ++ "" = ""

Equations

s.append t = { toByteArray := s.toByteArray ++ t.toByteArray, isValidUTF8 := ⋯ }

Instances For

source

instance instAppendString :

Append String

Equations

instAppendString = { append := fun (s t : String) => s.append t }

source

@[simp]

theorem String.toByteArray_append {s t : String} :

(s ++ t).toByteArray = s.toByteArray ++ t.toByteArray

source

theorem String.toByteArray_inj {s t : String} :

s.toByteArray = t.toByteArray ↔ s = t

source

@[simp]

theorem String.toByteArray_ofList {l : List Char} :

(ofList l).toByteArray = l.utf8Encode

source

@[deprecated String.toByteArray_ofList (since := "2025-10-30")]

theorem List.toByteArray_asString {l : List Char} :

(String.ofList l).toByteArray = l.utf8Encode

source

theorem String.exists_eq_ofList (s : String) :

∃ (l : List Char), s = ofList l

source

@[deprecated String.exists_eq_ofList (since := "2025-10-30")]

theorem String.exists_eq_asString (s : String) :

∃ (l : List Char), s = ofList l

source

@[simp]

theorem String.utf8ByteSize_empty :

"".utf8ByteSize = 0

source

@[simp]

theorem String.utf8ByteSize_append {s t : String} :

(s ++ t).utf8ByteSize = s.utf8ByteSize + t.utf8ByteSize

source

@[simp]

theorem String.size_toByteArray {s : String} :

s.toByteArray.size = s.utf8ByteSize

source

@[simp]

theorem String.toByteArray_push {s : String} {c : Char} :

(s.push c).toByteArray = s.toByteArray ++ [c].utf8Encode

source

def String.rawStartPos (_s : String) :

Pos.Raw

The start position of the string, as a String.Pos.Raw.

Equations

_s.rawStartPos = 0

Instances For

source

@[simp]

theorem String.rawStartPos_eq {s : String} :

s.rawStartPos = 0

source

@[simp]

theorem String.byteIdx_rawEndPos {s : String} :

s.rawEndPos.byteIdx = s.utf8ByteSize

source

@[deprecated String.byteIdx_rawEndPos (since := "2025-10-20")]

theorem String.byteIdx_endPos {s : String} :

s.rawEndPos.byteIdx = s.utf8ByteSize

source

@[simp]

theorem String.utf8ByteSize_ofByteArray {b : ByteArray} {h : b.IsValidUTF8} :

{ toByteArray := b, isValidUTF8 := h }.utf8ByteSize = b.size

source

@[simp]

theorem String.toByteArray_singleton {c : Char} :

(singleton c).toByteArray = [c].utf8Encode

source

theorem String.singleton_eq_ofList {c : Char} :

singleton c = ofList [c]

source

@[deprecated String.singleton_eq_ofList (since := "2025-10-30")]

theorem String.singleton_eq_asString {c : Char} :

singleton c = ofList [c]

source

@[simp]

theorem String.append_singleton {s : String} {c : Char} :

s ++ singleton c = s.push c

source

@[simp]

theorem String.append_left_inj {s₁ s₂ : String} (t : String) :

s₁ ++ t = s₂ ++ t ↔ s₁ = s₂

source

theorem String.append_assoc {s₁ s₂ s₃ : String} :

s₁ ++ s₂ ++ s₃ = s₁ ++ (s₂ ++ s₃)

source

@[simp]

theorem String.utf8ByteSize_eq_zero_iff {s : String} :

s.utf8ByteSize = 0 ↔ s = ""

source

theorem String.rawEndPos_eq_zero_iff {b : String} :

b.rawEndPos = 0 ↔ b = ""

source

@[deprecated String.rawEndPos_eq_zero_iff (since := "2025-10-20")]

theorem String.endPos_eq_zero_iff {b : String} :

b.rawEndPos = 0 ↔ b = ""

source

@[inline]

def String.pushn (s : String) (c : Char) (n : Nat) :

String

Adds multiple repetitions of a character to the end of a string.

Returns s, with n repetitions of c at the end. Internally, the implementation repeatedly calls String.push, so the string is modified in-place if there is a unique reference to it.

Examples:

"indeed".pushn '!' 2 = "indeed!!"
"indeed".pushn '!' 0 = "indeed"
"".pushn ' ' 4 = "    "

Equations

s.pushn c n = Nat.repeat (fun (s : String) => s.push c) n s

Instances For

source

theorem String.pushn_eq_repeat_push {s : String} {c : Char} {n : Nat} :

s.pushn c n = Nat.repeat (fun (s : String) => s.push c) n s

source

@[export lean_string_pushn]

def String.Internal.pushnImpl (s : String) (c : Char) (n : Nat) :

String

Equations

String.Internal.pushnImpl s c n = s.pushn c n

Instances For

source

@[inline]

def String.isEmpty (s : String) :

Bool

Checks whether a string is empty.

Empty strings are equal to "" and have length and end position 0.

Examples:

"".isEmpty = true
"empty".isEmpty = false
" ".isEmpty = false

Equations

s.isEmpty = (s.utf8ByteSize == 0)

Instances For

source

@[export lean_string_isempty]

def String.Internal.isEmptyImpl (s : String) :

Bool

Equations

String.Internal.isEmptyImpl s = s.isEmpty

Instances For

source

@[inline]

def String.join (l : List String) :

String

Appends all the strings in a list of strings, in order.

Use String.intercalate to place a separator string between the strings in a list.

Examples:

String.join ["gr", "ee", "n"] = "green"
String.join ["b", "", "l", "", "ue"] = "blue"
String.join [] = ""

Equations

String.join l = List.foldl (fun (r s : String) => r ++ s) "" l

Instances For

source

def String.intercalate (s : String) :

List String → String

Appends the strings in a list of strings, placing the separator s between each pair.

Examples:

", ".intercalate ["red", "green", "blue"] = "red, green, blue"
" and ".intercalate ["tea", "coffee"] = "tea and coffee"
" | ".intercalate ["M", "", "N"] = "M |  | N"

Equations

s.intercalate [] = ""
s.intercalate (a :: as) = String.intercalate.go✝ a s as

Instances For

source

@[export lean_string_intercalate]

def String.Internal.intercalateImpl (s : String) :

List String → String

Equations

String.Internal.intercalateImpl s = s.intercalate

Instances For

source

structure String.Pos.Raw.IsValid (s : String) (off : Raw) :

Prop

Predicate for validity of positions inside a String.

There are multiple equivalent definitions for validity.

We say that a position is valid if the string obtained by taking all of the bytes up to, but excluding, the given position, is valid UTF-8; see Pos.Raw.isValid_iff_isValidUTF8_extract_zero.

Similarly, a position is valid if the string obtained by taking all of the bytes starting at the given position is valid UTF-8; see Pos.isValid_iff_isValidUTF8_extract_utf8ByteSize.

An equivalent condition is that the position is the length of the UTF-8 encoding of some prefix of the characters of the string; see Pos.isValid_iff_exists_append and Pos.isValid_iff_exists_take_data.

Another equivalent condition that can be checked efficiently is that the position is either the end position, or it is strictly smaller than the end position and the byte at the position satisfies UInt8.IsUTF8FirstByte; see Pos.isValid_iff_isUTF8FirstByte.

Examples:

String.Pos.Raw.IsValid "abc" ⟨0⟩
String.Pos.Raw.IsValid "abc" ⟨1⟩
String.Pos.Raw.IsValid "abc" ⟨3⟩
¬ String.Pos.Raw.IsValid "abc" ⟨4⟩
String.Pos.Raw.IsValid "𝒫(A)" ⟨0⟩
¬ String.Pos.Raw.IsValid "𝒫(A)" ⟨1⟩
¬ String.Pos.Raw.IsValid "𝒫(A)" ⟨2⟩
¬ String.Pos.Raw.IsValid "𝒫(A)" ⟨3⟩
String.Pos.Raw.IsValid "𝒫(A)" ⟨4⟩

le_rawEndPos : off ≤ s.rawEndPos
isValidUTF8_extract_zero : (s.toByteArray.extract 0 off.byteIdx).IsValidUTF8

Instances For

source

theorem String.Pos.Raw.IsValid.le_utf8ByteSize {s : String} {off : Raw} (h : IsValid s off) :

off.byteIdx ≤ s.utf8ByteSize

source

theorem String.Pos.Raw.isValid_iff_isValidUTF8_extract_zero {s : String} {p : Raw} :

IsValid s p ↔ p ≤ s.rawEndPos ∧ (s.toByteArray.extract 0 p.byteIdx).IsValidUTF8

source

@[deprecated String.Pos.Raw.IsValid.le_rawEndPos (since := "2025-10-20")]

theorem String.Pos.Raw.IsValid.le_endPos {s : String} {off : Raw} (h : IsValid s off) :

off ≤ s.rawEndPos

source

@[simp]

theorem String.Pos.Raw.isValid_zero {s : String} :

IsValid s 0

source

@[simp]

theorem String.Pos.Raw.isValid_rawEndPos {s : String} :

IsValid s s.rawEndPos

source

theorem String.Pos.Raw.isValid_of_eq_rawEndPos {s : String} {p : Raw} (h : p = s.rawEndPos) :

IsValid s p

source

@[simp]

theorem String.Pos.Raw.isValid_empty_iff {p : Raw} :

IsValid "" p ↔ p = 0

source

structure String.Pos (s : String) :

Type

A Pos s is a byte offset in s together with a proof that this position is at a UTF-8 character boundary.

offset : Raw
The underlying byte offset of the Pos.
isValid : Raw.IsValid s self.offset
The proof that offset is valid for the string s.

Instances For

source

theorem String.Pos.ext_iff {s : String} {x y : s.Pos} :

x = y ↔ x.offset = y.offset

source

theorem String.Pos.ext {s : String} {x y : s.Pos} (offset : x.offset = y.offset) :

x = y

source

instance String.instDecidableEqPos {s✝ : String} :

DecidableEq s✝.Pos

Equations

String.instDecidableEqPos = String.instDecidableEqPos.decEq

source

def String.instDecidableEqPos.decEq {s✝ : String} (x✝ x✝¹ : s✝.Pos) :

Decidable (x✝ = x✝¹)

Equations

One or more equations did not get rendered due to their size.

Instances For

source

@[inline]

def String.startPos (s : String) :

s.Pos

The start position of s, as an s.Pos.

Equations

s.startPos = { offset := 0, isValid := ⋯ }

Instances For

source

@[simp]

theorem String.offset_startPos {s : String} :

s.startPos.offset = 0

source

instance String.instInhabitedPos {s : String} :

Inhabited s.Pos

Equations

String.instInhabitedPos = { default := s.startPos }

source

@[inline]

def String.endPos (s : String) :

s.Pos

The past-the-end position of s, as an s.Pos.

Equations

s.endPos = { offset := s.rawEndPos, isValid := ⋯ }

Instances For

source

@[simp]

theorem String.offset_endPos {s : String} :

s.endPos.offset = s.rawEndPos

source

instance String.instLEPos {s : String} :

LE s.Pos

Equations

String.instLEPos = { le := fun (l r : s.Pos) => l.offset ≤ r.offset }

source

instance String.instLTPos {s : String} :

LT s.Pos

Equations

String.instLTPos = { lt := fun (l r : s.Pos) => l.offset < r.offset }

source

theorem String.Pos.le_iff {s : String} {l r : s.Pos} :

l ≤ r ↔ l.offset ≤ r.offset

source

theorem String.Pos.lt_iff {s : String} {l r : s.Pos} :

l < r ↔ l.offset < r.offset

source

instance String.instDecidableLePos {s : String} (l r : s.Pos) :

Decidable (l ≤ r)

Equations

String.instDecidableLePos l r = decidable_of_iff' (l.offset ≤ r.offset) ⋯

source

instance String.instDecidableLtPos {s : String} (l r : s.Pos) :

Decidable (l < r)

Equations

String.instDecidableLtPos l r = decidable_of_iff' (l.offset < r.offset) ⋯

source

structure String.Slice :

Type

A region or slice of some underlying string.

A slice consists of a string together with the start and end byte positions of a region of interest. Actually extracting a substring requires copying and memory allocation, while many slices of the same underlying string may exist with very little overhead. While this could be achieved by tracking the bounds by hand, the slice API is much more convenient.

String.Slice bundles proofs to ensure that the start and end positions always delineate a valid string. For this reason, it should be preferred over Substring.Raw.

str : String
The underlying string.
startInclusive : self.str.Pos
The byte position of the start of the string slice.
endExclusive : self.str.Pos
The byte position of the end of the string slice.
startInclusive_le_endExclusive : self.startInclusive ≤ self.endExclusive
The slice is not degenerate (but it may be empty).

Instances For

source

instance String.instInhabitedSlice :

Inhabited Slice

Equations

One or more equations did not get rendered due to their size.

source

@[inline]

def String.toSlice (s : String) :

Slice

Returns a slice that contains the entire string.

Equations

s.toSlice = { str := s, startInclusive := s.startPos, endExclusive := s.endPos, startInclusive_le_endExclusive := ⋯ }

Instances For

source

instance String.instCoeSlice :

Coe String Slice

Equations

String.instCoeSlice = { coe := String.toSlice }

source

@[simp]

theorem String.startInclusive_toSlice {s : String} :

s.toSlice.startInclusive = s.startPos

source

@[simp]

theorem String.endExclusive_toSlice {s : String} :

s.toSlice.endExclusive = s.endPos

source

@[simp]

theorem String.str_toSlice {s : String} :

s.toSlice.str = s

source

@[inline]

def String.Slice.utf8ByteSize (s : Slice) :

Nat

The number of bytes of the UTF-8 encoding of the string slice.

Equations

s.utf8ByteSize = s.startInclusive.offset.byteDistance s.endExclusive.offset

Instances For

source

theorem String.Slice.utf8ByteSize_eq {s : Slice} :

s.utf8ByteSize = s.endExclusive.offset.byteIdx - s.startInclusive.offset.byteIdx

source

instance String.instHAddRawSlice :

HAdd Pos.Raw Slice Pos.Raw

Equations

String.instHAddRawSlice = { hAdd := fun (p : String.Pos.Raw) (s : String.Slice) => { byteIdx := p.byteIdx + s.utf8ByteSize } }

source

instance String.instHAddSliceRaw :

HAdd Slice Pos.Raw Pos.Raw

Equations

String.instHAddSliceRaw = { hAdd := fun (s : String.Slice) (p : String.Pos.Raw) => { byteIdx := s.utf8ByteSize + p.byteIdx } }

source

instance String.instHSubRawSlice :

HSub Pos.Raw Slice Pos.Raw

Equations

String.instHSubRawSlice = { hSub := fun (p : String.Pos.Raw) (s : String.Slice) => { byteIdx := p.byteIdx - s.utf8ByteSize } }

source

@[simp]

theorem String.Pos.Raw.byteIdx_add_slice {p : Raw} {s : Slice} :

(p + s).byteIdx = p.byteIdx + s.utf8ByteSize

source

@[simp]

theorem String.Pos.Raw.byteIdx_slice_add {s : Slice} {p : Raw} :

(s + p).byteIdx = s.utf8ByteSize + p.byteIdx

source

@[simp]

theorem String.Pos.Raw.byteIdx_sub_slice {p : Raw} {s : Slice} :

(p - s).byteIdx = p.byteIdx - s.utf8ByteSize

source

@[inline]

def String.Slice.rawEndPos (s : Slice) :

Pos.Raw

The end position of a slice, as a Pos.Raw.

Equations

s.rawEndPos = { byteIdx := s.utf8ByteSize }

Instances For

source

@[simp]

theorem String.Slice.byteIdx_rawEndPos {s : Slice} :

s.rawEndPos.byteIdx = s.utf8ByteSize

source

structure String.Pos.Raw.IsValidForSlice (s : Slice) (p : Raw) :

Prop

Criterion for validity of positions in string slices.

le_rawEndPos : p ≤ s.rawEndPos
isValid_offsetBy : IsValid s.str (p.offsetBy s.startInclusive.offset)

Instances For

source

theorem String.Pos.Raw.IsValidForSlice.le_utf8ByteSize {s : Slice} {p : Raw} (h : IsValidForSlice s p) :

p.byteIdx ≤ s.utf8ByteSize

source

@[inline]

def String.Slice.getUTF8Byte (s : Slice) (p : Pos.Raw) (h : p < s.rawEndPos) :

UInt8

Accesses the indicated byte in the UTF-8 encoding of a string slice.

At runtime, this function is implemented by efficient, constant-time code.

Equations

s.getUTF8Byte p h = s.str.getUTF8Byte (p.offsetBy s.startInclusive.offset) ⋯

Instances For

source

def String.Slice.getUTF8Byte! (s : Slice) (p : Pos.Raw) :

UInt8

Accesses the indicated byte in the UTF-8 encoding of the string slice, or panics if the position is out-of-bounds.

Equations

One or more equations did not get rendered due to their size.

Instances For

source

structure String.Slice.Pos (s : Slice) :

Type

A Slice.Pos s is a byte offset in s together with a proof that this position is at a UTF-8 character boundary.

offset : Pos.Raw
The underlying byte offset of the Slice.Pos.
isValidForSlice : Pos.Raw.IsValidForSlice s self.offset
The proof that offset is valid for the string slice s.