Aktív témák

  • P.H.

    senior tag

    válasz P.H. #101 üzenetére

    Eheti 1000-mátrix-hetente rovat :)

    Core2 (2.5 GHz): 60 sec alatt 510000 mátrix

    { Start-up: fetch the first input byte and zero the __0STARROW field of the records. }
    { Registers as used here: EBP = index bound (expected to be negative, the loop counts up to zero), }
    { EDI = base of the record area, EDX = pointer to the input bytes. }
    { Records are addressed as [EDI+index*8+field] with the index stepping by 4, i.e. 32 bytes apart. }
    { The tags in braces ahead of each instruction are the author's own layout notes and are kept as written; }
    { NOTE(review): x1 presumably marks the ES: prefix as one byte of padding - confirm. }
    {@04-} { x1 } movsx ebx,byte ptr es:[edx]
    {1-} xor eax,eax
    {2-} mov esi,ebp
    {0} and esi,-8 { start from EBP with its low three bits cleared }
    @init:
    {@0F} mov [edi+esi*08h+(00h*08h)+__0STARROW],eax { zero the field in record ESI ... }
    {1} mov [edi+esi*08h+(04h*08h)+__0STARROW],eax { ... and in record ESI+4 }
    {2*} add esi,08h
    {0*} jnz @init { clears ESI register }
    { } add edx,01h { step the pointer past the byte already held in EBX }
    { -} mov ecx,ebp { ECX = running index for the loop that follows }
    { Input pass: ECX runs from EBP up to 0 in steps of 4 while ESI stays 0. }
    { For each sign-extended input value v held in EBX: }
    {   - EBP is stored into __0STARROW of record EBP+4*v, before the sign check, so also when v is negative; }
    {   - __FIXEDROW of record ECX receives EBP+4*v when v is not negative, or 0 when v is negative. }
    { The flags of the CMP survive to the CMOVS because LEA, MOVSX and MOV do not modify flags. }
    @@ARGUMENT: { K10:2.6 Core2:2.9 - 3.3 uop/clk - 1640*2+6550 }
    {@20} cmp ebx,esi { 4 AGU + 9 EX uops on Kaveri }
    {1} lea eax,[ebp+ebx*04h+00h] { 3 clk 8 ALU ops on Core 2 }
    {2} movsx ebx,[edx] { next input value; NOTE(review): no size override here, unlike the byte ptr load before @init - confirm the assembler reads a byte }
    {0} lea edx,[edx+01h] { pointer increment that leaves the flags alone }
    {1} mov [edi+eax*08h+__0STARROW],ebp { __0COUNTER <- EBP }
    {2} cmovs eax,esi { negative input value: replace the index by 0 }
    {0} mov [edi+ecx*08h+__FIXEDROW],eax
    {1*} add ecx,04h
    {2*} jnz @@ARGUMENT { clears ECX register }
    { } add esp,ebp { with EBP negative this lowers ESP by -EBP bytes }
    { -} mov eax,edi
    { -} push ebp { saved copy, read back later through [esp+00h] }
    {@40-} lea edx,[ebp-04h] { index of the record just below the first one }
    { Row pass. Each trip handles one row record: the modifier computed for the previous row is stored, }
    { __FIXEDROW of the next row is copied to its __0STAR, and unless that value is negative the row is }
    { scanned two elements per iteration for its unsigned minimum. Every element is ORed with the matching }
    { column record's __0STARROW before the compare, so a negative field there makes the element compare high. }
    { The row's modifier is minus that minimum; a minimum with the sign bit set falls into @@ABNORMAL_EXIT. }
    @@REDUCE_ROWS:
    {@43} mov [edi+edx*08h+__ROWMODIFIER],ecx { modifier for record EDX: 0 on entry and for skipped rows }
    {1} mov esi,[edi+edx*08h+(04h*08h)+__FIXEDROW] { __FIXEDROW of the next record }
    {2*} add edx,04h
    {0*} jz @@REDUCE_COLUMNS { index reached 0: every row has been handled }
    {@50} mov [edi+edx*08h+__0STAR],esi
    {2-} xor ecx,ecx
    {0} sub eax,ebp { EBP is negative here, so EAX moves up by -EBP bytes, one matrix row }
    {1**} test esi,esi { JS/JNS can only fuse with TEST }
    {2**} js @@REDUCE_ROWS
    { -} mov ebx,ebp { EBX < 0 for even minimum }
    { } mov ecx,[eax+ebp] { first element of the row seeds the second running minimum }
    {@61} or ecx,[edi+ebp*08h+__0STARROW]
    { } and ebp,04h
    { } add ebp,ebx { start at EBP+(EBP and 4): for EBP a multiple of 4 the distance left to 0 is a multiple of 8 }
    {@69} @findrowmin: { K10:2.8 Core2:2.2 - 2.6 uop/clk - 1100*2+5000 }
    {0} mov esi,[eax+ebp+00h] { 4 AGU + 8 EX uops on Kaveri }
    {1} or esi,[edi+ebp*08h+(00h*08h)+__0STARROW] { 3 clk 10 ALU ops on Core 2 }
    {2} add ebp,08h
    {@72} cmp esi,ebx
    {1} cmovb ebx,esi { unsigned minimum of the first element of each pair }
    {2} mov esi,[eax+ebp-04h]
    {0} or esi,[edi+ebp*08h-(04h*08h)+__0STARROW]
    {1} cmp esi,ecx
    {@81} cmovb ecx,esi { unsigned minimum of the second element of each pair }
    {0**} test ebp,ebp
    {1**} jnz @findrowmin
    { } mov ebp,[esp+00h] { restore EBP from the copy on top of the stack }
    { } cmp ebx,ecx
    { } cmovb ecx,ebx { merge the two running minima }
    {@90} neg ecx { row modifier = minus the minimum }
    { } jle @@REDUCE_ROWS { continue unless the negated minimum came out positive }
    { Failure exit, reached by fall-through from the row pass and by a JS in the column pass. }
    { Drops the dword on top of the stack, moves ESP back by EBP, writes 0FFFFFFFFh to TRESULT.OPTIMUM of }
    { the structure addressed by [esp+__MARKS], loads TRESULT.NEXTIVALUE into EBX and leaves through the }
    { code pointer stored at [esp+_INVALIDRESULT]. }
    @@ABNORMAL_EXIT:
    {@94} pop eax { discard the saved EBP copy }
    {1} sub esp,ebp { counterpart of the ADD ESP,EBP done before the row pass }
    {2} mov edx,0FFFFFFFFh
    {0} mov esi,[esp+__MARKS]
    {@A0} mov [esi+TRESULT.OPTIMUM],edx
    {2} mov ebx,[esi+TRESULT.NEXTIVALUE]
    {0} jmp dword ptr [esp+_INVALIDRESULT]
    { Three small entry points used by the column pass that follows. }
    { The TEST below is tagged x6 and sits directly after an unconditional JMP; it looks like alignment }
    { filler - NOTE(review): confirm nothing branches to it. }
    { } { x6 } test ebp,0FFFFFFFFh
    { @initcol: hand-over to @@1ST_STEP. Flips the sign of both EBP and its saved copy on the stack, }
    { keeps the old EBP in ESI, stores ECX in __INITCOL and sets EBX to all ones. }
    {@90} @initcol:
    {0} neg dword ptr [esp+00h]
    {1-} mov esi,ebp
    {2} neg ebp
    {0} mov [edi+__INITCOL],ecx
    {1} or ebx,-1
    {2} jmp @@1ST_STEP { long jump instruction }
    { @free0col: ECX = EDX-4, then continues into @setcolmod. }
    {@A2} @free0col:
    { } lea ecx,[edx-04h]
    { @setcolmod: stores ESI as __COLMODIFIER of column EDX and falls through into @@REDUCE_COLUMNS. }
    {@A5} @setcolmod:
    { } mov [edi+edx*08h+__COLMODIFIER],esi
    { Column pass. EDX steps down by 4 from 0 towards EBP, one column record per iteration: }
    {   - a column whose __0STARROW has the sign bit set gets __COLMODIFIER = 0 through @setcolmod; }
    {   - otherwise the column is scanned row by row for the unsigned minimum of }
    {     element + __ROWMODIFIER, ORed with the row's __FIXEDROW; }
    {   - a combined value of exactly 0 jumps straight to @test0row; }
    {   - after a full scan @seekcol0 looks for a row whose element + __ROWMODIFIER equals the minimum and }
    {     whose __0STAR is not negative; @test0row then links row and column through __0STAR / __0STARROW; }
    {   - @free0col / @setcolmod store the minimum as the column's __COLMODIFIER. }
    { A minimum with the sign bit set leaves through @@ABNORMAL_EXIT. }
    { FIX: the loop test had no compare of its own, so the JZ consumed leftover flags. The entry from }
    { @@REDUCE_ROWS always arrives with ZF set by its ADD EDX,04h, which sent control to @initcol at once: }
    { the column pass never ran and EDX stayed 0, although @@2ND_STEP and @@results expect it negative. }
    { CMP EDX,EBP ends the loop once the column at index EBP has been handled. The 2-byte CMP also makes }
    { the address tags between @A5 and @B0 add up and supplies the slot-0 partner of the JZ tagged 1**. }
    @@REDUCE_COLUMNS:
    {0**} cmp edx,ebp { all columns done once EDX has come down to EBP }
    {1**} jz @initcol
    {0} sub edx,04h
    {@B0-} xor esi,esi
    {1**} test [edi+edx*08h+__0STARROW],ebp
    {2**} js @setcolmod { sign bit set: store modifier 0 for this column and move on }
    { } lea ebx,[edi+edx]
    { -} mov ecx,ebp { ECX = row index, from EBP up to 0 }
    { -} mov eax,ebp { EAX = running minimum; a negative start value is the largest for the unsigned compare }
    { } sub ebx,ebp { EBX = address of this column's element in the first row }
    {@C0} @findcolmin: { K10:3.0 Core2:_._ - _._ uop/clk - ____*2+____ }
    {0} mov esi,[ebx] { 3 AGU + 8 EX uops on Kaveri }
    {1} add esi,[edi+ecx*08h+__ROWMODIFIER] { 3 clk 9 ALU ops on Core 2 }
    {2} or esi,[edi+ecx*08h+__FIXEDROW]
    {0} jz @test0row
    {1} sub ebx,ebp { next row of the same column }
    {2} cmp esi,eax
    {@D0} cmovb eax,esi
    {1*} add ecx,04h
    {2*} jnz @findcolmin
    { } lea ecx,[ebp-04h] { restart the row index one step before the first row }
    { -} mov esi,eax { ESI = column minimum, stored later by @setcolmod }
    { } lea ebx,[edi+edx]
    {@E0**} test eax,eax { JS/JNS can only fuse with TEST }
    { **} js @@ABNORMAL_EXIT
    {@E4} @seekcol0:
    {0} mov eax,[edi+ecx*08h+(04h*08h)+__ROWMODIFIER]
    {1*} add ecx,04h
    {2*} jz @free0col { ran out of rows: the column gets its modifier but no link }
    {0} sub ebx,ebp
    {1} add eax,[ebx]
    {@F1**} cmp eax,esi { maximum data value = 00FFFFFFh -> marked elements stay negative }
    {0**} jnz @seekcol0
    @test0row:
    { **} test [edi+ecx*08h+__0STAR],ebp
    { **} js @seekcol0 { row ECX already carries a negative __0STAR: keep looking }
    { } mov [edi+edx*08h+__0STARROW],ecx { column EDX -> row ECX }
    {@FE} mov [edi+ecx*08h+__0STAR],edx { row ECX -> column EDX }
    {@02} jns @free0col { forced conditional jump for Sandy Bridge }
    { ----------------------------------------------------------------------------------------------- }
    { The instructions tagged x12 and x5 sit between a jump annotated as always taken and the next label; }
    { they look like alignment filler - NOTE(review): confirm they are never executed, XOR EBP,EBP would be destructive. }
    {@04} { x12 } mov eax,00000000h; mov edx,00000000h; xor ebp,ebp
    {@10} { x5 } mov ecx,00000000h
    { Modifier update, entered from @chk2col with ECX = 0 and the value to apply in EDX. }
    {   @DEC5_free_col: walks the column records from the one named by __INITCOL up to index 0 and adds EDX }
    {     to __COLMODIFIER of every column whose __COLMARK is negative. The loop is software-pipelined: }
    {     the mask for the next record is prepared while the current one is updated. }
    {   @inc5row: adds EDX to __ROWMODIFIER of each row index read from the list kept on the stack. }
    { EBX and ECX are reloaded from the two sign-extended 16-bit halves of __MINCOLROW. }
    { EBP serves as a scratch register in the second loop; its last load is [ESP+0]. }
    @@5TH_STEP: { K10:2.6 Core2:2.4 - 2.8 uop/clk - 2000*2+5100 }
    {@15} mov eax,[edi+__INITCOL] { lea eax,[ebp+04h]; neg eax }
    {1} mov esi,[esp+__SIZE]
    {2} movsx ebx,word ptr [edi+__MINCOLROW]
    {@20} @DEC5_free_col: { 3 AGU + 6 EX uops on Kaveri }
    {0} add [edi+eax*08h+__COLMODIFIER],ecx { 2 clk 5 ALU ops on Core 2 }
    {1} mov ecx,[edi+eax*08h+(04h*08h)+__COLMARK]
    {2} sar ecx,1Fh { all ones when the next column's mark is negative, otherwise 0 }
    {0} and ecx,edx { EDX for a negative mark, otherwise 0 }
    {1*} add eax,04h
    {@30*} jnz @DEC5_free_col { clears EAX register [NOT USED] }
    { } mov eax,[esp+__SIZE+esi*04h]
    { } movsx ecx,word ptr [edi+__MINCOLROW+02h]
    { } jmp @INC5_marked_row
    { x4 } xor ebp,ebp; xor esi,esi
    {@40} @inc5row:
    {0} add [edi+eax*08h+__ROWMODIFIER],edx { 4 AGU + 4 EX uops on Kaveri }
    {1-} mov eax,ebp
    @INC5_marked_row:
    {2} mov ebp,[esp+esi*04h]
    {0*} sub esi,01h
    {1*} jge @inc5row { sets ESI to 0FFFFFFFFh }
    { Entered by fall-through with ESI = -1, EBX = a row index and ECX = a column index. }
    { If __0STAR of row EBX is 0 control goes to @4TH_STEP with ESI = 0. Otherwise: }
    {   @re3start: stores ECX in the row's __0COLON___ROWMARK and restarts the column scan from __INITCOL; }
    {   @mark3row: appends row EBX to the list on the stack, indexed by EAX with the count kept at }
    {     [esp+__SIZE], and writes the negative column index ESI into that column's __COLMARK; }
    {   @chk2col: advances ECX to the next column whose __COLMARK is negative and falls into @@2ND_STEP, }
    {     or goes to @@5TH_STEP once ECX reaches 0. }
    @@3RD_STEP:
    {@4E*} and esi,[edi+ebx*08h+__0STAR]
    {@52*} jz @4TH_STEP { long jump instruction }
    {@58} @re3start:
    { } mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
    { } { x1 } mov ecx,es:[edi+__INITCOL] { lea ecx,es:[ebp-04h] }
    {@60-} mov edx,ebx
    {@62} @mark3row:
    { } mov [esp+__OFFS+eax*04h],ebx
    { -} xor ebx,ebx
    { } mov [edi+esi*08h+__COLMARK],esi { unmark column with negative }
    { } inc eax
    { } mov [esp+__SIZE],eax
    {@71} @chk2col:
    {0*} add ecx,04h
    {1*} jz @@5TH_STEP { clears ECX register }
    {2**} test [edi+ecx*08h+__COLMARK],ecx { STORE FORWARDED from @mark3row }
    {0**} jns @chk2col
    { Scan of one column, index in ECX, entered with EBX = 0 and EBP positive. }
    { For each row the value element + __ROWMODIFIER - __COLMODIFIER is formed and ORed with the row's }
    { __0COLON___ROWMARK field. A result of 0 leaves the loop at @zero; otherwise the unsigned minimum is }
    { kept in EDX and the row index of that minimum in CX, the column index being parked in the upper half }
    { of ECX. A signed overflow in the sum skips the row. }
    { @zero is also reached when the loop runs out with EBX = 0. It merges the packed column:row pair into }
    { __MINCOLROW and then dispatches: }
    {   EBX = 0                          -> @chk2col, next column; }
    {   __0STAR of the zero's row is 0   -> @4TH_STEP; }
    {   recorded minimum row equals EBX  -> @re3start; }
    {   otherwise                        -> @mark3row, resuming from the lower of ESI and ECX, minus 4. }
    @@2ND_STEP:
    {12} push dword ptr [edi+ecx*08h+__COLMODIFIER]
    {@80} lea eax,[ecx+edi]
    { } sub ebx,ebp { EBX = -EBP: index of the first row }
    { } sal ecx,10h { column index moves to the upper half, CX becomes 0 }
    { } mov esi,[edi+ebx*08h+__ROWMODIFIER]
    {@8C} @ZERO2col: { K10:3.0 Core2:2.5 - 2.9 uop/clk - 1500*2+5600 { 4 AGU + 11 EX uops on Kaveri }
    {0} sub esi,[esp+00h] { 4 clk 13 ALU ops on Core 2 }
    {@8F} add esi,[eax+ebp]
    {C2D} lea eax,[eax+ebp] { Core 2, Kaveri }
    {2} jo @over2flow { overflow: (-x)+(-y)=(+z) or (+x)+(+y)=(-z) }
    {0} or esi,[edi+ebx*08h+__0COLON___ROWMARK]
    {1} jz @zero
    {K10}// lea eax,[eax+ebp] { K10, Sandy Bridge, Ivy Bridge }
    {0} cmp esi,edx
    {@9F} cmovb edx,esi
    {@A2} cmovb cx,bx
    @over2flow:
    {0} mov esi,[edi+ebx*08h+(04h*08h)+__ROWMODIFIER]
    {1*} add ebx,04h
    {2*} jnz @ZERO2col { clears EBX register }
    {@AF} @zero:
    {0} pop eax { add esp,04h } { forces ESP handling to AGU/memory pipe on Kaveri/Core }
    {@B0-} mov eax,ecx
    {2} sar ecx,10h { ECX = column index again; CF = bit 15 of the old ECX, the sign of the row in CX }
    {0} cmovnc eax,[edi+__MINCOLROW] { CF clear: keep the previous __MINCOLROW value }
    {1} mov [edi+__MINCOLROW],eax
    {2**} test ebx,ebx
    {0**} jz @chk2col
    {@C0*} add esi,[edi+ebx*08h+__0STAR] { zero found -> ESI=0 }
    {2*} jz @4TH_STEP
    {0} cmp ax,bx { flags are consumed by the JZ two lines down; the MOV in between leaves them intact }
    {1} { x1 } mov eax,ss:[esp+__SIZE]
    {2} jz @re3start
    {@D0} cmp esi,ecx
    {1} mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
    {2} cmovl ecx,esi
    {0*} sub ecx,04h { never clears ECX register }
    {1*} jnz @mark3row { forced conditional jump for Sandy Bridge }
    { The two instructions below are tagged x2 and x4 and follow a jump annotated as always taken; }
    { they look like alignment filler - NOTE(review): confirm. }
    { x2 } xor esi,esi
    {@E0} { x4 } lea eax,[ebp+ebp+00h]
    { Re-linking loop. Starting from row EBX and column ECX it links the pair through __0STAR and }
    { __0STARROW, then continues with the row the column was linked to before, in EDX, and with that row's }
    { __0COLON___ROWMARK value as the next column, until the previous link read from the column is 0. }
    { Both jumps to @4TH_STEP are JZ on an instruction that wrote ESI, so ESI is 0 on arrival; the }
    { closing lines therefore leave ESI = EDX = -EBP and ECX = -EBP-4 before falling into @@1ST_STEP. }
    @@4TH_STEP: { 5 AGU + 3 EX uops on Kaveri }
    {@E4-} mov ebx,edx { 2 clk 2 ALU ops on Core 2 }
    @4TH_STEP:
    {@E6} mov edx,[edi+ecx*08h+__0STARROW]
    {2} mov [edi+ebx*08h+__0STAR],ecx
    {0} mov [edi+ecx*08h+__0STARROW],ebx
    {@F0} mov ecx,[edi+edx*08h+__0COLON___ROWMARK]
    {2**} cmp edx,00h
    {0**} jnz @@4TH_STEP { clears EDX register }
    { } sub esi,ebp
    { } sub edx,ebp
    { } lea ecx,[esi-04h] { mov ecx,[edi+__INITCOL] }
    { Per-record reset, ESI running from its entry value up to 0 in steps of 4: }
    {   __COLMARK           <- NOT __0STARROW }
    {   __0COLON___ROWMARK  <- __FIXEDROW }
    { EBX accumulates the AND of all __0STARROW values seen so far; while that AND still has its sign bit }
    { set, ECX follows ESI. NOT and MOV leave the flags alone, so the CMOVS still sees the result of the AND. }
    { Afterwards the list count at [esp+__SIZE] is reset to 0 and ECX+4 selects the column for @@2ND_STEP. }
    { If ECX+4 is 0 control falls through towards @@results with ESI = result pointer and EBX = EDI. }
    @@1ST_STEP: { K10:2.8 Core2:2.9 - 3.2 uop/clk - 1500*2+6100 }
    {@00} mov eax,[edi+esi*08h+__0STARROW] { 4 AGU + 7 EX uops on Kaveri }
    {1} and ebx,eax { 3 clk 6 ALU ops on Core 2 }
    {2} not eax
    {0} mov [edi+esi*08h+__COLMARK],eax
    {1} mov eax,[edi+esi*08h+__FIXEDROW]
    {2} cmovs ecx,esi
    {0} mov [edi+esi*08h+__0COLON___ROWMARK],eax
    {1*} add esi,04h
    {2*} jnz @@1ST_STEP { clears ESI register }
    { } mov [esp+__SIZE],esi
    { -} xor ebx,ebx
    {@21*} add ecx,04h { long jump instruction }
    { *} jnz @@2ND_STEP { ===>>> EBX: 00h EDX:negative ECX:initcol (>= EBP) }
    { NOTE(review): the EBP+04h in the address below presumably skips the stack area reserved earlier plus the saved EBP dword - confirm. }
    { } mov esi,[esp+ebp+04h+__MARKS]
    { -} mov ebx,edi { work matrix unmodified } { [esp+__SAVE] }
    { Result output. For each row index EDX, stepping by 4 up to 0: reads the row's linked column from }
    { __0STAR, adds the matrix element at that row and column to the running sum in ECX, and writes the }
    { column number, (column index + EBP) shr 2, as one byte through ESI. }
    { Afterwards the saved EBP is popped, ESP is moved forward by EBP, the sum is stored in TRESULT.OPTIMUM }
    { and ESI is moved back by the number of bytes just written before control leaves to @onchain. }
    { NOTE(review): @onchain is outside this excerpt; the values left in EAX, EBX, ECX and EBP are presumably its inputs. }
    @@results:
    {@30} mov eax,[edi+edx*08h+__0STAR] { 3 AGU + 8 EX uops on Kaveri }
    {1} add ebx,ebp { advance one matrix row }
    {2} add ecx,[ebx+eax]
    {0} add eax,ebp
    {1} shr eax,02h
    {2} mov [esi],al
    {@40} add esi,01h
    {1*} add edx,04h
    {2*} jnz @@results { clears EDX register ( DL=0 as head, DH=0 as length ) }
    {0} pop eax
    {1} add esp,ebp { counterpart of the ADD ESP,EBP done while EBP was still negative }
    {2} neg ebp
    {0} or eax,-1
    {@50} lea ebx,[edi+ebp*04h]
    {1} sar ebp,02h { EBP = minus the number of bytes written }
    {2} mov [esi+ebp+TRESULT.OPTIMUM],ecx
    {0} add esi,ebp
    {1-} xor ecx,ecx
    {2} jmp @onchain

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

Aktív témák