Zulip Chat Archive
Stream: lean4
Topic: Short array optimization experiment
Tomas Skrivan (Jul 10 2022 at 20:15):
I have defined NFloatArray n, which is just a FloatArray with the size in the type, but with a specialization for small values of n. I'm curious what the best way to work with such a structure is.
I have defined several versions of addOne, a function that adds one to every component. As a test, I ran each version 10 million times for n = 2. The times are:
addOne_v1 457ms
addOne_v2 138ms
addOne_v3 84.1ms
addOne_v4 37.7ms
addOne_v5 34.1ms
Can anyone get it even faster? Why does inlining addOne_v3 make it roughly twice as slow? Every other version of addOne benefits from inlining.
File: NFloatArray.lean
structure Float2 where
  x : Float
  y : Float

def NFloatArray (n : Nat) : Type :=
  match n with
  | 0 => Unit
  | 1 => Float
  | 2 => Float2
  | n'+3 => {xs : FloatArray // xs.size = n'+3}

instance : ToString (NFloatArray n) :=
  ⟨λ xs =>
    match n, xs with
    | 0, _ => "[]"
    | 1, x => let x : Float := x; s!"[{x}]"
    | 2, ⟨x,y⟩ => s!"[{x}, {y}]"
    | _+3, ⟨xs,_⟩ => toString xs⟩

def Array.toFloatArray (xs : Array Float) : FloatArray := ⟨xs⟩

namespace NFloatArray

@[inline]
def iget (xs : NFloatArray n) (i : Fin n) : Float :=
  match n, xs with
  | 0, _ => 0
  | 1, x => x
  | 2, ⟨x,y⟩ =>
    match i with
    | ⟨0,_⟩ => x
    | ⟨1,_⟩ => y
  | _+3, ⟨xs',h⟩ => xs'.get (h ▸ i)

def get (xs : NFloatArray n) (i : Fin n) : Float :=
  match n, xs with
  | 0, _ => 0
  | 1, x => x
  | 2, ⟨x,y⟩ =>
    match i with
    | ⟨0,_⟩ => x
    | ⟨1,_⟩ => y
  | _+3, ⟨xs',h⟩ => xs'.get (h ▸ i)

set_option trace.compiler.ir.result true in
@[inline]
def addOne_v1 {n} (xs : NFloatArray n) : NFloatArray n :=
  match n, xs with
  | 0, _ => Unit.unit
  | 1, x => let x : Float := x; (x + 1 : Float)
  | 2, xs' => ⟨xs'.get 0 + 1, xs'.get 1 + 1⟩
  | _+3, ⟨xs',h⟩ => ⟨xs',h⟩ -- TODO: implement me

set_option trace.compiler.ir.result true in
@[inline]
def addOne_v2 {n} (xs : NFloatArray n) : NFloatArray n :=
  match n, xs with
  | 0, _ => Unit.unit
  | 1, x => let x : Float := x; (x + 1 : Float)
  | 2, xs' => ⟨xs'.1 + 1, xs'.2 + 1⟩
  | _+3, ⟨xs',h⟩ => ⟨xs',h⟩ -- TODO: implement me

set_option trace.compiler.ir.result true in
def addOne_v3 {n} (xs : NFloatArray n) : NFloatArray n :=
  match n, xs with
  | 0, _ => Unit.unit
  | 1, x => let x : Float := x; (x + 1 : Float)
  | 2, xs' => ⟨xs'.iget 0 + 1, xs'.iget 1 + 1⟩
  | _+3, ⟨xs',h⟩ => ⟨xs',h⟩ -- TODO: implement me

set_option trace.compiler.ir.result true in
@[inline]
def addOne_v4 {n} (xs : NFloatArray n) : NFloatArray n :=
  match n, xs with
  | 0, _ => Unit.unit
  | 1, x => let x : Float := x; (x + 1 : Float)
  | 2, ⟨x,y⟩ => ⟨x+1, y+1⟩
  | _+3, ⟨xs',h⟩ => ⟨xs',h⟩ -- TODO: implement me

set_option trace.compiler.ir.result true in
@[inline]
def addOne_v5 : NFloatArray 2 → NFloatArray 2 :=
  λ ⟨x,y⟩ => ⟨x+1, y+1⟩

end NFloatArray
File: Main.lean
import NFloatArray

open NFloatArray

def main : IO Unit := do
  timeit "addOne_v1" $ do
    let mut v : NFloatArray 2 := ⟨1,2⟩
    for i in [0:10000000] do
      v := addOne_v1 v
  timeit "addOne_v2" $ do
    let mut v : NFloatArray 2 := ⟨1,2⟩
    for i in [0:10000000] do
      v := addOne_v2 v
  timeit "addOne_v3" $ do
    let mut v : NFloatArray 2 := ⟨1,2⟩
    for i in [0:10000000] do
      v := addOne_v3 v
  timeit "addOne_v4" $ do
    let mut v : NFloatArray 2 := ⟨1,2⟩
    for i in [0:10000000] do
      v := addOne_v4 v
  timeit "addOne_v5" $ do
    let mut v : NFloatArray 2 := ⟨1,2⟩
    for i in [0:10000000] do
      v := addOne_v5 v
Tomas Skrivan (Jul 10 2022 at 20:24):
Adding the -O2 flag via moreLeancArgs := #["-O2"] makes v4 as fast as v5.
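A minimal sketch of where that flag could live in a Lake configuration; the package and target names below are made up, only the moreLeancArgs field comes from the message above, and the exact syntax depends on the Lake version:
import Lake
open Lake DSL

-- hypothetical package/target names; `moreLeancArgs` forwards extra flags to `leanc`
package nfloat where
  moreLeancArgs := #["-O2"]

lean_lib NFloatArray

@[default_target]
lean_exe bench where
  root := `Main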
Sebastian Ullrich (Jul 10 2022 at 20:47):
Did you compare against a straight FloatArray?
Tomas Skrivan (Jul 10 2022 at 20:58):
Good point, I didn't.
Tomas Skrivan (Jul 10 2022 at 21:02):
This implementation:
def addOne_v0 (xs : FloatArray) : FloatArray := Id.run do
  let mut xs := xs
  for i in [0:xs.size] do
    let xi := xs[i]'sorry
    xs := xs.set ⟨i,sorry⟩ (xs[i]'sorry+1)
  xs
takes roughly 140ms.
Sebastian Ullrich (Jul 10 2022 at 21:03):
What happens if you replace xs.size by 2?
Tomas Skrivan (Jul 10 2022 at 21:04):
Nothing, more or less the same time.
Tomas Skrivan (Jul 10 2022 at 21:14):
Unrolling the loop gives 41ms:
def addOne_v0 (xs : FloatArray) : FloatArray := Id.run do
  let mut xs := xs
  xs := xs.uset 0 (xs.uget 0 sorry+1) sorry
  xs := xs.uset 1 (xs.uget 1 sorry+1) sorry
  xs
Tomas Skrivan (Jul 10 2022 at 21:14):
Looks like using a specialized structure for small n might not be necessary.
Tomas Skrivan (Jul 10 2022 at 21:38):
Running an equivalent program in C++ takes 9ms. Also, in Lean, running just an empty loop with 10 million iterations takes 26ms.
Tomas Skrivan (Jul 10 2022 at 21:43):
It looks like addOne_v5 is hitting a Lean limit: it takes 34ms, which is roughly 35ms = 9ms (C++) + 26ms (empty Lean for loop).
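The empty-loop baseline itself is not shown in the thread; a minimal sketch of what such a measurement might look like (hypothetical code, not from the discussion):
-- hypothetical baseline: time an empty loop with 10 million iterations
def emptyLoopBench : IO Unit :=
  timeit "empty loop" do
    for _ in [0:10000000] do
      pure ()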
Mac (Jul 10 2022 at 21:44):
@Tomas Skrivan what if you use BaseIO instead of IO?
Tomas Skrivan (Jul 10 2022 at 21:45):
Not sure what you are suggesting. Where and how should I use BaseIO?
Mac (Jul 10 2022 at 21:46):
I mean use BaseIO as the monad to run these experiments rather than IO.
Tomas Skrivan (Jul 10 2022 at 21:47):
Still not clear to me. Are you suggesting def main : BaseIO Unit? Then I do not have the timeit function.
Mac (Jul 10 2022 at 21:48):
Ah, oops, yeah. I forgot timeit is in IO -- that may be a problem. This means that Lean has to check whether the loop errored on every iteration.
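A hedged sketch of what the BaseIO idea could look like while still timing from IO; the names and the liftM wiring below are illustrative, not something from the thread:
import NFloatArray
open NFloatArray

-- hypothetical: run the hot loop in BaseIO and lift it into IO only for timeit
def loopBase : BaseIO (NFloatArray 2) := do
  let mut v : NFloatArray 2 := ⟨1, 2⟩
  for _ in [0:10000000] do
    v := addOne_v5 v
  return v

def main : IO Unit := do
  discard <| timeit "addOne_v5 (BaseIO loop)" (liftM loopBase)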
Tomas Skrivan (Jul 10 2022 at 21:52):
Ahh, I see now! That makes perfect sense.
timeit "nothing" $ pure (Id.run do
let mut v : FloatArray := #[1,2].toFloatArray
for i in [0:10000000] do
v := v)
now takes 0.000254ms instead of 26ms
Mac (Jul 10 2022 at 21:52):
:)
Tomas Skrivan (Jul 10 2022 at 21:55):
Ahh, that is not correct. I think the loop gets eliminated; instead of pure $ Id.run it should be, for example, IO.println $ Id.run, and the loop should return v.
Tomas Skrivan (Jul 10 2022 at 21:55):
Still significantly faster, 0.018ms.
Mac (Jul 10 2022 at 21:57):
I think it still may be precomputing the loop though.
Tomas Skrivan (Jul 10 2022 at 22:00):
Yeah, the numbers are suspicious.
Tomas Skrivan (Jul 10 2022 at 22:03):
It is definitely precomputing the loop. The numbers are nonsensical now.
Mac (Jul 10 2022 at 22:05):
(deleted -- I am an idiot -- was not returning v)
Mac (Jul 10 2022 at 22:10):
This test works (and is producing the expected code), but the time it reports does not seem like the time it actually takes in the editor:
set_option trace.compiler.ir.result true

opaque opaqueId (a : FloatArray) : FloatArray := a

def test_pure :=
  timeit "pure" do
    IO.println $ Id.run do
      let mut v : FloatArray := ⟨#[1,2]⟩
      for _ in [0:10000000] do
        v := opaqueId v
      return v

#eval test_pure
/-
[1.000000, 2.000000]
pure 0.102ms
-/
Tomas Skrivan (Jul 10 2022 at 22:17):
That is really odd: the time does not match, but the loop is not precomputed.
Mac (Jul 10 2022 at 22:25):
It seems to me like some post-IR optimization is eliminating the loop.
Mac (Jul 10 2022 at 22:27):
However, I was not aware such an optimization stage existed, which confuses me greatly.
Mac (Jul 10 2022 at 22:28):
Actually, wait, the problem is that it is lifting the loop out of the timeit.
Tomas Skrivan (Jul 10 2022 at 22:31):
The variable x_6 already holds the computed value, right? So the timeit ◾ x_8 x_7 x_1 just unpacks the value from x_7.
.
Mac (Jul 10 2022 at 22:31):
Fixed:
set_option trace.compiler.ir.result true

opaque opaqueId (a : FloatArray) : FloatArray := a

def compute_pure : IO FloatArray := pure $ Id.run do
  let mut v : FloatArray := ⟨#[0]⟩
  for _ in [0:10000000] do
    v := opaqueId v
  return v
/-
[result]
def Std.Range.forIn.loop._at.compute_pure._spec_1 (x_1 : obj) (x_2 : obj) (x_3 : @& obj) (x_4 : @& obj) (x_5 : @& obj) : obj :=
  let x_6 : u8 := Nat.decLe x_3 x_2;
  case x_6 : u8 of
  Bool.false →
    let x_7 : obj := 0;
    let x_8 : u8 := Nat.decEq x_1 x_7;
    case x_8 : u8 of
    Bool.false →
      let x_9 : obj := 1;
      let x_10 : obj := Nat.sub x_1 x_9;
      dec x_1;
      let x_11 : obj := Nat.add x_2 x_4;
      dec x_2;
      let x_12 : obj := Std.Range.forIn.loop._at.compute_pure._spec_1 x_10 x_11 x_3 x_4 x_5;
      ret x_12
    Bool.true →
      dec x_2;
      dec x_1;
      inc x_5;
      ret x_5
  Bool.true →
    dec x_2;
    dec x_1;
    inc x_5;
    ret x_5
def compute_pure._closed_1 : float :=
  let x_1 : obj := 0;
  let x_2 : u8 := 0;
  let x_3 : float := Float.ofScientific x_1 x_2 x_1;
  ret x_3
def compute_pure._closed_2 : obj :=
  let x_1 : obj := 1;
  let x_2 : obj := Array.mkEmpty ◾ x_1;
  ret x_2
def compute_pure._closed_3._boxed_const_1 : obj :=
  let x_1 : float := compute_pure._closed_1;
  let x_2 : obj := box x_1;
  ret x_2
def compute_pure._closed_3 : obj :=
  let x_1 : obj := compute_pure._closed_2;
  let x_2 : obj := compute_pure._closed_3._boxed_const_1;
  let x_3 : obj := Array.push ◾ x_1 x_2;
  ret x_3
def compute_pure._closed_4 : obj :=
  let x_1 : obj := compute_pure._closed_3;
  let x_2 : obj := FloatArray.mk x_1;
  ret x_2
def compute_pure (x_1 : obj) : obj :=
  let x_2 : obj := 10000000;
  let x_3 : obj := 0;
  let x_4 : obj := 1;
  let x_5 : obj := compute_pure._closed_4;
  let x_6 : obj := Std.Range.forIn.loop._at.compute_pure._spec_1 x_2 x_3 x_2 x_4 x_5;
  dec x_5;
  let x_7 : obj := ctor_0[EStateM.Result.ok] x_6 x_1;
  ret x_7
def Std.Range.forIn.loop._at.compute_pure._spec_1._boxed (x_1 : obj) (x_2 : obj) (x_3 : obj) (x_4 : obj) (x_5 : obj) : obj :=
  let x_6 : obj := Std.Range.forIn.loop._at.compute_pure._spec_1 x_1 x_2 x_3 x_4 x_5;
  dec x_5;
  dec x_4;
  dec x_3;
  ret x_6
-/

def test_pure :=
  timeit "pure" compute_pure
/-
[result]
def test_pure._closed_1 : obj :=
  let x_1 : obj := "pure";
  ret x_1
def test_pure._closed_2 : obj :=
  let x_1 : obj := pap compute_pure;
  ret x_1
def test_pure (x_1 : obj) : obj :=
  let x_2 : obj := test_pure._closed_1;
  let x_3 : obj := test_pure._closed_2;
  let x_4 : obj := timeit ◾ x_2 x_3 x_1;
  dec x_2;
  ret x_4
-/

set_option trace.compiler.ir.result false

#eval test_pure
/-
pure 4.56s
[0.000000]
-/
Mac (Jul 10 2022 at 22:33):
@Tomas Skrivan if you run the above test from an executable you should get a more accurate result.
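A minimal sketch of how that could look as an executable; the module name Bench and the wiring are assumptions, not from the thread:
-- hypothetical Main.lean for a lean_exe target; `Bench` is assumed to contain `opaqueId` and `compute_pure`
import Bench

def main : IO Unit := do
  discard <| timeit "pure" compute_pure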
Tomas Skrivan (Jul 10 2022 at 22:34):
It looks like the opaqueId is no longer necessary.
Tomas Skrivan (Jul 10 2022 at 22:48):
Hmm, interesting: compute_pure really has to be a definition, otherwise it gets precomputed. Also, the pure $ Id.run seems to make it a little bit slower, 193ms vs 171ms.
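The variant being compared against is not shown in the thread; one plausible reading is that the loop runs directly in IO instead of being wrapped in pure $ Id.run, roughly:
-- a guess at the variant without `pure $ Id.run`: the same loop, run directly in IO
def compute_io : IO FloatArray := do
  let mut v : FloatArray := ⟨#[0]⟩
  for _ in [0:10000000] do
    v := opaqueId v
  return v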