Keresés

Aktív témák

  • P.H.

    senior tag

    válasz P.H. #102 üzenetére

    Core2 (2.5 GHz): 59 sec alatt megoldja a feladatot
    K10 (2.9 GHz): Core2-nek tetsző ciklusverzióval 46 sec alatt oldja meg a feladatot
    Prescott (2.26 GHz): 60 sec alatt 220000 mátrix

    // Setup: fetch the first input byte and clear the __0STARROW table.
    // EBX = sign-extended byte at ES:[EDX], EAX = 0, ESI = EBP with its low three bits cleared.
    // NOTE(review): EBP appears to hold a negative, 4-scaled element count here - confirm against the caller.
    {@04-} { x1 } movsx ebx,byte ptr es:[edx]
    {1-} xor eax,eax
    {2-} mov esi,ebp
    {0} and esi,-8
    // Each pass stores zero into two __0STARROW slots (offsets 0 and 4*8) and advances ESI by 8 until it reaches zero.
    @init:
    {@0F} mov [edi+esi*08h+(00h*08h)+__0STARROW],eax
    {1} mov [edi+esi*08h+(04h*08h)+__0STARROW],eax
    {2*} add esi,08h
    {0*} jnz @init { clears ESI register }
    // Step the input pointer past the byte already loaded and start the next loop counter at EBP.
    { } add edx,01h
    { -} mov ecx,ebp
    // Per-byte input pass. ESI is zero here, so the CMP tests the sign of the current byte in EBX.
    // EAX = EBP + EBX*4 indexes __0STARROW, where EBP is stored. The next byte is loaded and EDX advanced.
    // CMOVS still sees the flags of the CMP because LEA, MOVSX and MOV leave flags untouched:
    // a negative byte writes 0 into the __FIXEDROW slot of the current index ECX, otherwise EAX is written.
    @@ARGUMENT: { K10:2.6 Core2:2.9 - 3.3 uop/clk - 1640*2+6550 }
    {@20} cmp ebx,esi { 4 AGU + 9 EX uops on Kaveri }
    {1} lea eax,[ebp+ebx*04h+00h] { 3 clk 8 ALU ops on Core 2 }
    {2} movsx ebx,[edx]
    {0} lea edx,[edx+01h]
    {1} mov [edi+eax*08h+__0STARROW],ebp { __0COUNTER <- EBP }
    {2} cmovs eax,esi
    {0} mov [edi+ecx*08h+__FIXEDROW],eax
    {1*} add ecx,04h
    {2*} jnz @@ARGUMENT { clears ECX register }
    // Prepare row reduction: ECX = 0, EAX = EDI, EBP saved on the stack, EDX = EBP - 4.
    { -} { x2 } xor ecx,ecx
    { -} mov eax,edi
    { -} push ebp
    {@40-} lea edx,[ebp-04h]
    // Row reduction. On every entry ECX is stored as the __ROWMODIFIER of index EDX, then EDX advances by 4.
    // When EDX reaches zero control passes to @@REDUCE_COLUMNS.
    // The __FIXEDROW value of the new index is copied into its __0STAR slot. If that value is negative
    // the index is skipped, and ECX = 0 becomes its modifier on the next entry.
    // EAX is decremented by EBP once per index.
    // NOTE(review): EAX presumably walks the matrix one row at a time - confirm the layout.
    @@REDUCE_ROWS:
    {@43} mov [edi+edx*08h+__ROWMODIFIER],ecx
    {1} mov esi,[edi+edx*08h+(04h*08h)+__FIXEDROW]
    {2*} add edx,04h
    {0*} jz @@REDUCE_COLUMNS
    {@50} mov [edi+edx*08h+__0STAR],esi
    {2-} xor ecx,ecx
    {0} sub eax,ebp
    {1**} test esi,esi { JS/JNS can only fuse with TEST }
    {2**} js @@REDUCE_ROWS
    // Seed two running minima: EBX = EBP, ECX = first element OR its __0STARROW entry.
    // EBP is then set to (EBP and 4) + original EBP before entering the loop.
    { -} mov ebx,ebp { EBX < 0 for even minimum }
    { } mov ecx,[eax+ebp]
    {@61} or ecx,[edi+ebp*08h+__0STARROW]
    { } and ebp,04h
    { } add ebp,ebx
    // Two elements per iteration. Each element is ORed with its __0STARROW entry
    // and folded into EBX or ECX with an unsigned minimum (CMP followed by CMOVB).
    {@69} @findrowmin: { K10:2.8 Core2:2.2 - 2.6 uop/clk - 1100*2+5000 }
    {0} mov esi,[eax+ebp+00h] { 4 AGU + 8 EX uops on Kaveri }
    {1} or esi,[edi+ebp*08h+(00h*08h)+__0STARROW] { 3 clk 10 ALU ops on Core 2 }
    {2} add ebp,08h
    {@72} cmp esi,ebx
    {1} cmovb ebx,esi
    {2} mov esi,[eax+ebp-04h]
    {0} or esi,[edi+ebp*08h-(04h*08h)+__0STARROW]
    {1} cmp esi,ecx
    {@81} cmovb ecx,esi
    {0**} test ebp,ebp
    {1**} jnz @findrowmin
    // Reload EBP from the stack copy, merge both minima into ECX and negate it.
    // JLE loops back with ECX as the next modifier. A positive negated minimum falls through to @@ABNORMAL_EXIT.
    { } mov ebp,[esp+00h]
    { } cmp ebx,ecx
    { } cmovb ecx,ebx
    {@90} neg ecx
    { } jle @@REDUCE_ROWS
    { -} nop
    // Failure exit: drops the top stack dword into EAX, writes -1 to TRESULT.OPTIMUM of the record
    // addressed by [ESP+__MARKS], loads TRESULT.NEXTIVALUE into EBX and jumps through [ESP+_INVALIDRESULT].
    // NOTE(review): the popped dword is presumably the EBP saved before row reduction - confirm.
    @@ABNORMAL_EXIT:
    {@95} pop eax
    {1} or edx,-1
    {2} mov esi,[esp+__MARKS]
    {0} mov [esi+TRESULT.OPTIMUM],edx
    {@A0} mov ebx,[esi+TRESULT.NEXTIVALUE]
    {2} jmp dword ptr [esp+_INVALIDRESULT]
    // The TEST below follows an unconditional jump and is tagged x6 by the author - it looks like alignment padding.
    { } { x6 } test ebp,0FFFFFFFFh
    // Column reduction. Control enters at @@REDUCE_COLUMNS further down - the labels above it are its branch targets.
    // @init0col runs once EDX equals EBP: stores ECX in __INITCOL, copies EBP to ESI, negates EBP,
    // sets EBX = -1 and branches to @@1ST_STEP.
    // The original comment marks that branch as a forced conditional jump.
    {@AD} @init0col:
    {0} mov [edi+__INITCOL],ecx
    {@B0-} mov esi,ebp
    {2} neg ebp
    {0} or ebx,-1
    {1*} sub ecx,04h
    {2*} jnz @@1ST_STEP { long jump instruction } { forced conditional jump for Sandy Bridge }
    {@C0} { x3 } cmp ebp,00h
    // @free0col records the current column index in ECX. @setcolmod stores ESI as its __COLMODIFIER.
    {@C3} @free0col:
    { -} mov ecx,edx
    {@C5} @setcolmod:
    { } mov [edi+edx*08h+__COLMODIFIER],esi
    // Per-column entry: stop when EDX equals EBP, otherwise step EDX down by 4 and clear ESI.
    // TEST against EBP exposes the sign bit of the __0STARROW entry. A negative entry gets modifier 0 via @setcolmod.
    @@REDUCE_COLUMNS: { no need to initialize -initcol in ECX }
    {0**} cmp edx,ebp
    {1**} jz @init0col
    {0} sub edx,04h
    {@D0-} xor esi,esi
    {1**} test [edi+edx*08h+__0STARROW],ebp
    {2**} js @setcolmod
    { } lea ebx,[edi+edx]
    { -} mov ecx,ebp
    { -} mov eax,ebp
    { } sub ebx,ebp
    // Scan: ESI = element + __ROWMODIFIER, ORed with __FIXEDROW. A zero result jumps straight to @test0row.
    // Otherwise EAX keeps the unsigned minimum while EBX steps by -EBP and ECX by 4.
    // NOTE(review): the brace comment on the label line below is not closed on that line.
    // It ends at the first closing brace of the next line, which is harmless only while comments do not nest.
    {@E0} @findcolmin: { K10:3.0 Core2:_._ - _._ uop/clk - ____*2+____
    {0} mov esi,[ebx] { 3 AGU + 8 EX uops on Kaveri }
    {1} add esi,[edi+ecx*08h+__ROWMODIFIER] { 3 clk 9 ALU ops on Core 2 }
    {2} or esi,[edi+ecx*08h+__FIXEDROW]
    {0} jz @test0row
    {1} sub ebx,ebp
    {2} cmp esi,eax
    {@F0} cmovb eax,esi
    {1*} add ecx,04h
    {2*} jnz @findcolmin
    // No zero found: ESI = minimum, restart the walk (ECX = EBP - 4, EBX = EDI + EDX). A negative minimum aborts.
    { } lea ecx,[ebp-04h]
    { -} mov esi,eax
    { } lea ebx,[edi+edx]
    {@00**} test eax,eax { JS/JNS can only fuse with TEST }
    { **} js @@ABNORMAL_EXIT
    // Search for an index whose __ROWMODIFIER plus element equals the minimum in ESI.
    // Running out of indices goes to @free0col.
    {@04} @seekcol0:
    {0} mov eax,[edi+ecx*08h+(04h*08h)+__ROWMODIFIER]
    {1*} add ecx,04h
    {2*} jz @free0col
    {0} sub ebx,ebp
    {1} add eax,[ebx]
    {@11**} cmp eax,esi { maximum data value = 00FFFFFFh -> marked elements stay negative }
    {0**} jnz @seekcol0
    // Candidate found: a negative __0STAR entry resumes the search.
    // Otherwise the pair is linked both ways (__0STARROW of EDX = ECX, __0STAR of ECX = EDX).
    // The closing JNS reuses the flags of the TEST, since MOV does not alter them.
    @test0row:
    { **} test [edi+ecx*08h+__0STAR],ebp
    { **} js @seekcol0
    { } mov [edi+edx*08h+__0STARROW],ecx
    {@1E} mov [edi+ecx*08h+__0STAR],edx
    {@22} jns @free0col { forced conditional jump for Sandy Bridge }
    { ----------------------------------------------------------------------------------------------- }
    // The two lines tagged x12 and x9 sit after a branch the author marks as always taken - they look like alignment padding.
    {@24} { x12 } test ebp,0FFFFFFFFh; test edi,0FFFFFFFFh
    {@30} { x9 } mov ecx,00000000h; xor esi,esi; xor edi,edi
    // Step 5: ECX = packed value from __MINCOLROW, EBX decremented by EBP, EDX negated.
    // NOTE(review): the brace comment on the label line below is not closed on that line.
    // It ends at the first closing brace of the next line.
    @@5TH_STEP: { K10:2.6 Core2:_._ - _._ uop/clk - ____*2+____
    {@39} mov ecx,[edi+__MINCOLROW]
    { } sub ebx,ebp
    { } neg edx
    // For every index: __COLMARK is collapsed to 0 or -1 with SAR 31 and written back.
    // EDX masked by it is subtracted from __COLMODIFIER.
    // The sign mask of __0COLON___ROWMARK gates the same subtraction from __ROWMODIFIER.
    {@40} @DEC5_free_col: { 5 AGU + 11 EX uops on Kaveri }
    {0} mov eax,[edi+ebx*08h+__COLMARK] { 3 clk 8 ALU ops on Core 2 }
    {1} sar eax,1Fh
    {2} mov [edi+ebx*08h+__COLMARK],eax
    {0} and eax,edx
    {1} sub [edi+ebx*08h+__COLMODIFIER],eax
    {@51} mov eax,[edi+ebx*08h+__0COLON___ROWMARK]
    {0} sar eax,1Fh
    {1} and eax,edx
    {2} sub [edi+ebx*08h+__ROWMODIFIER],eax
    {0*} add ebx,04h
    {@61*} jnz @DEC5_free_col { clears EBX register [NOT USED] }
    // Unpack ECX: low word sign-extended into EBX, high word into ECX.
    // A zero __0STAR entry for EBX goes to @4TH_STEP.
    // Otherwise the row mark is set, the __COLMARK of the referenced index becomes -1 and ESI reloads __INITCOL.
    {@63} movsx ebx,cx
    {1} sar ecx,10h
    {2} mov esi,[edi+ebx*08h+__0STAR]
    {0**} cmp esi,00h
    {@70**} jz @4TH_STEP { long jump instruction }
    {2} mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
    {0} mov dword ptr [edi+esi*08h+__COLMARK],0FFFFFFFFh { unmark column with -1 }
    {1} mov esi,[edi+__INITCOL]
    // Restart the scan: EBX = 0, ECX = ESI - 4.
    {@85} @mark3row:
    { -} xor ebx,ebx
    { } lea ecx,[esi-04h]
    { } jmp @chk2col
    @pass2col:
    { } mov [edi+ecx*08h+__COLMARK],ecx { re-mark column with column index <> -1 }
    // Advance ECX by 4. Reaching zero re-enters @@5TH_STEP.
    // Indices whose __COLMARK is below or equal to ECX (unsigned) are skipped, the first other one falls through.
    {@90} @chk2col:
    {0*} add ecx,04h
    {1*} jz @@5TH_STEP { clears ECX register }
    {2**} cmp [edi+ecx*08h+__COLMARK],ecx
    {0**} jbe @chk2col
    // Step 2: scan index ECX. EAX = ECX + EDI is the element pointer and EBX is decremented by EBP.
    @@2ND_STEP:
    { } lea eax,[ecx+edi]
    { } sub ebx,ebp
    // The __COLMODIFIER of ECX is parked on the stack, ECX moves to the high word, ESI preloads the first __ROWMODIFIER.
    @continue:
    {@A0} { x1 } push dword ptr es:[edi+ecx*08h+__COLMODIFIER]
    { } sal ecx,10h
    { } mov esi,[edi+ebx*08h+__ROWMODIFIER]
    // Per entry: ESI = __ROWMODIFIER - stacked modifier + element, with EAX stepped by EBP.
    // Signed overflow skips the entry.
    // ORing in __0COLON___ROWMARK and getting zero exits to @@3RD_STEP.
    // Otherwise EDX keeps the unsigned minimum and CX records EBX for it.
    {@AC} @ZERO2col: { K10:3.0 Core2:2.5 - 2.9 uop/clk - 1500*2+5600 { 4 AGU + 11 EX uops on Kaveri }
    {0} sub esi,[esp+00h] { 4 clk 13 ALU ops on Core 2 }
    {@AF} add esi,[eax+ebp]
    {C2D} lea eax,[eax+ebp]
    {2} jo @over2flow { overflow: (-x)+(-y)=(+z) or (+x)+(+y)=(-z) }
    {0} or esi,[edi+ebx*08h+__0COLON___ROWMARK]
    {1} jz @@3RD_STEP
    {K10}// lea eax,[eax+ebp]
    {0} cmp esi,edx
    {@BF} cmovb edx,esi
    {@C2} cmovb cx,bx
    // Preload the next __ROWMODIFIER and advance EBX. The loop ends when EBX reaches zero.
    @over2flow:
    {0} mov esi,[edi+ebx*08h+(04h*08h)+__ROWMODIFIER]
    {1*} add ebx,04h
    {2*} jnz @ZERO2col { clears EBX register }
    // Step 3: discard the stacked modifier, then update __MINCOLROW.
    // ESI takes the packed ECX and ECX is shifted down to its high word.
    // CMOVNC uses the carry left by SAR, which is bit 15 of the packed value, because MOV does not alter flags.
    // With carry clear the stored __MINCOLROW is kept.
    @@3RD_STEP:
    {@CF} pop esi { add esp,04h } { enforces ESP handling to AGU/load pipe on Kaveri/Core }
    {@D0-} mov esi,ecx
    {2} sar ecx,10h
    {0} cmovnc esi,[edi+__MINCOLROW]
    {1} mov [edi+__MINCOLROW],esi
    // EBX = 0 means the scan ran to its end without a zero - back to @pass2col.
    {2**} test ebx,ebx
    {0**} jz @pass2col
    // A zero was found at EBX. An empty __0STAR entry goes to @4TH_STEP.
    // Otherwise the row mark is set and the __COLMARK of the index held in that entry becomes -1.
    {@E0} mov esi,[edi+ebx*08h+__0STAR]
    {2**} test esi,esi
    {0**} jz @4TH_STEP
    {1} mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
    {2} or dword ptr [edi+esi*08h+__COLMARK],-1 { unmark column with -1 }
    // If the low word of __MINCOLROW names this same EBX, restart through @re2start.
    // Otherwise go to @mark3row when ESI is below ECX, or continue with the next EBX.
    {@F1**} cmp word ptr [edi+__MINCOLROW],bx
    {1**} jz @re2start
    {2**} cmp esi,ecx { jb = jl for 2 negative numbers }
    {0**} jb @mark3row
    {1*} add ebx,04h
    {2*} jnz @continue
    {@00} jmp @pass2col
    {1} { x2 } xor eax,eax
    // Restart: ECX = __INITCOL, EBX = -EBP, then every __COLMARK is collapsed to its sign mask.
    {@04} @re2start:
    {0} mov ecx,[edi+__INITCOL]
    {1-} mov ebx,ebp
    {2} neg ebx
    @initcol:
    {0} sar dword ptr [edi+ebx*08h+__COLMARK],1Fh
    {@10*} add ebx,04h
    {2*} jnz @initcol { clears EBX register }
    // EDX = -1 resets the running minimum. ECX steps back by 4 so @chk2col starts at __INITCOL.
    { } or edx,-1
    { *} sub ecx,04h
    { *} jnz @chk2col { long jump instruction }
    {@20} { x4 } lea eax,[ebp+ebp+00h]
    // Step 4: walk a chain of links and rewrite them.
    // The double-at entry first copies EDX into EBX. The single-at entry keeps the EBX supplied by the caller.
    // Each pass saves the old __0STARROW of ECX in EDX, then links EBX and ECX both ways
    // (__0STAR of EBX = ECX, __0STARROW of ECX = EBX).
    // It then loads the __0COLON___ROWMARK of the old link as the next ECX. An old link of zero ends the walk.
    @@4TH_STEP: { 5 AGU + 3 EX uops on Kaveri }
    {@24-} mov ebx,edx { 2 clk 2 ALU ops on Core 2 }
    @4TH_STEP:
    {@26} mov edx,[edi+ecx*08h+__0STARROW]
    {2} mov [edi+ebx*08h+__0STAR],ecx
    {0} mov [edi+ecx*08h+__0STARROW],ebx
    {@30} mov ecx,[edi+edx*08h+__0COLON___ROWMARK]
    {2**} cmp edx,00h
    {0**} jnz @@4TH_STEP { clears EDX register }
    // Re-seed for step 1: ESI and EDX are decremented by EBP, ECX = ESI - 4.
    // NOTE(review): this relies on ESI being zero here - confirm on the paths that reach the single-at entry.
    { } sub esi,ebp
    { } sub edx,ebp
    { } lea ecx,[esi-04h]
    // Step 1: rebuild the mark tables for every index ESI.
    // __COLMARK gets the bitwise complement of __0STARROW, and __0COLON___ROWMARK gets a copy of __FIXEDROW.
    // EBX accumulates the AND of all __0STARROW entries.
    // CMOVS reuses the sign flag of that AND, since NOT and MOV leave flags alone:
    // ECX is set to ESI whenever the accumulated value is negative.
    @@1ST_STEP: { K10:2.8 Core2:2.9 - 3.2 uop/clk - 1500*2+6100 }
    {@40} mov eax,[edi+esi*08h+__0STARROW] { 4 AGU + 7 EX uops on Kaveri }
    {1} and ebx,eax { 3 clk 6 ALU ops on Core 2 }
    {2} not eax
    {0} mov [edi+esi*08h+__COLMARK],eax
    {1} mov eax,[edi+esi*08h+__FIXEDROW]
    {2} cmovs ecx,esi
    {0} mov [edi+esi*08h+__0COLON___ROWMARK],eax
    {1*} add esi,04h
    {2*} jnz @@1ST_STEP { clears ESI register }
    // EBX is zeroed with MOV, which keeps flags intact. ECX + 4 being nonzero continues at @@2ND_STEP.
    // Otherwise fall through to result output: ESI = [ESP+04h+__MARKS], EBX = EDI.
    { } { x3 } mov ebx,00000000h
    {@60*} add ecx,04h { long jump instruction }
    { *} jnz @@2ND_STEP { ===>>> EBX: 00h EDX:negative = -EBP ECX:initcol (>= EBP) }
    { } { x1 } mov esi,ss:[esp+04h+__MARKS]
    { -} mov ebx,edi { work matrix unmodified } { [esp+__SAVE] }
    // Result output, one index EDX per pass: EAX = its __0STAR entry, EBX advances by EBP,
    // and the dword at [EBX+EAX] is added to the running total in ECX.
    // The byte stored at [ESI] is (EAX + EBP) shifted right by 2, then ESI moves to the next byte.
    // NOTE(review): presumably ECX accumulates the optimum and the bytes are the chosen column numbers - confirm.
    @@results:
    {@70} mov eax,[edi+edx*08h+__0STAR] { 3 AGU + 8 EX uops on Kaveri }
    {1} add ebx,ebp
    {2} add ecx,[ebx+eax]
    {0} add eax,ebp
    {1} shr eax,02h
    {2} mov [esi],al
    {@80} add esi,01h
    {1*} add edx,04h
    {2*} jnz @@results { clears EDX register ( DL=0 as head, DH=0 as length ) }

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

Aktív témák