Aktív témák

  • P.H.

    senior tag

    Skeleton of the code, tuned for Zen (1) and Zen+ (i.e. placeholder code is omitted).

    It runs at a steady 3.4 IPC for 50x50 matrices; with 4 ALUs available the maximum is 4.0 IPC, so this is 85% utilization.

    Since more than 90% of the instructions need an ALU, the key factor is how the instructions are distributed among the ALUs. This is true for all other microarchitectures as well.

    // Setup: loads the first signed byte of E.FIELD0 (eax points at the record),
    // saves ebp in the __A slot, and clears a run of __B entries.
    // edi is the base of the work area; ebp is used as an index that counts
    // up towards zero (assumed negative multiple of 4 - TODO confirm with caller).
    { } movsx ebx,byte ptr [eax+E.FIELD0+00h]
    { } mov esi,ebp
    { } xor edx,edx
    // esi = ebp rounded down to a multiple of 8, so the two-entry loop below
    // can terminate exactly on zero.
    { } and esi,-8
    { } mov [edi+__A],ebp
    { } mov ecx,ebp
    @init:
    // Each pass stores zero (edx) to two __B entries, at index esi and esi+4
    // (index scale is 8 bytes), then advances esi by 8 until it reaches zero.
    { } mov [edi+esi*08h+(00h*08h)+__B],edx
    { } mov [edi+esi*08h+(04h*08h)+__B],edx
    { } sub esi,-8
    { } jnz @init
    // Loop over the signed bytes of E.FIELD0, one byte per pass, with ecx
    // stepping by 4 up to zero. On entry ebx holds the current byte and
    // edx is zero (cleared before the preceding init loop).
    @@a:
    // The flags of this cmp are consumed by the cmova below; the lea, movsx
    // and mov instructions in between do not modify flags.
    { } cmp ebx,ebp
    { } lea esi,[ebp+ebx*04h]
    // Prefetch the next byte and advance the source pointer (lea keeps flags).
    { } movsx ebx,byte ptr [eax+E.FIELD0+01h]
    { } lea eax,[eax+01h]
    // The __B store is unconditional and uses the index before the cmova.
    { } mov [edi+esi*08h+__B],ebp
    // If the byte was above ebp (unsigned compare), replace the index by edx.
    { } cmova esi,edx
    { } mov [edi+ecx*08h+__C],esi
    { } add ecx,04h
    { } jnz @@a
    // Prepare registers for the next phase: eax = __C entry at index ebp,
    // ebx = ebp, edx = work-area base.
    { } mov eax,[edi+ebp*08h+__C]
    { } mov ebx,ebp
    { } mov edx,edi
    // NOTE(review): no label @b is defined in the visible listing; the nearest
    // name is @next0b. Presumably part of the omitted code - confirm.
    { } jmp @b
    // Per-index pass: ebx steps by 4 up to zero. For each index an unsigned
    // minimum is computed over a run of dwords and its negation is stored in
    // the __F slot. Presumably a per-row minimum of a cost matrix - TODO confirm.
    @@ROWS:
    // Load the __C entry of the next index, advance ebx, leave when it hits zero.
    { } mov eax,[edi+ebx*08h+(04h*08h)+__C]
    { } add ebx,04h
    { } jz @@COLS
    @next0b:
    // Record eax in both the __D and R slots of the current index and move
    // the data pointer edx on by -ebp bytes.
    { } mov [edi+ebx*08h+__D],eax
    { } sub edx,ebp
    { } mov [edi+ebx*08h+R],eax
    // A carry out of eax+ebp skips the minimum search for this index.
    { } add eax,ebp
    { } jc @@ROWS
    // Set up the search: ecx = (ebp+4) rounded down to a multiple of 8,
    // esi = first candidate (data dword OR-ed with its __B entry).
    // ebp is reused as a scratch register from here until it is reloaded
    // from the __A slot after the loop.
    { } lea ecx,[ebp+04h]
    { } mov esi,[edx+ebp]
    { } or esi,[edi+ebp*08h+__B]
    { } and ecx,-8
    { } mov ebp,[edi+ecx*08h+__B]
    @findr:
    // Two candidates per pass, each a data dword OR-ed with a __B entry.
    // eax and esi are two independent running unsigned minima.
    { } or ebp,[edx+ecx+00h]
    { } cmp ebp,eax
    { } cmovb eax,ebp
    { } mov ebp,[edx+ecx+04h]
    { } or ebp,[edi+ecx*08h+(04h*08h)+__B]
    { } cmp ebp,esi
    { } cmovb esi,ebp
    // Preload the __B entry used at the top of the next pass.
    { } mov ebp,[edi+ecx*08h+(08h*08h)+__B]
    { } add ecx,08h
    { } jnz @findr
    // Merge the two minima into esi and restore ebp from the __A slot.
    { } cmp eax,esi
    { } mov ebp,[edi+__A]
    { } cmovb esi,eax
    // Store the negated minimum; the mov keeps the flags of the neg, so the
    // jle is taken when the negated value is zero or negative (signed).
    { } neg esi
    { } mov [edi+__L+ebx*08h+__F],esi
    // NOTE(review): the target is spelled @ROWS here but the label defined
    // above is @@ROWS; possibly a typo or part of the omitted code - confirm.
    { } jle @ROWS
    // Exit path: reloads the pointer kept at [esp+_X] and writes 7FFFFFFFh to
    // its E.O field. Presumably a failure or no-result sentinel - TODO confirm.
    @@EXIT:
    { } mov esi,[esp+_X]
    // No explicit operand size is given; it is assumed to come from the
    // declared type of E.O.
    { } mov [esi+E.O],7FFFFFFFh
    // NOTE(review): @outside is not defined in the visible listing.
    { } jmp @outside
    // Second phase, entered at @@COLS when the preceding index loop runs out.
    // It walks ebx downwards in steps of 4, and for selected indices searches
    // a run of values for a minimum and then for the first entry equal to it.
    // Lines carrying the double-asterisk marker are always a cmp/test followed
    // directly by its conditional jump; presumably the author's tag for pairs
    // treated as one fused branch - TODO confirm.
    @free0col:
    // Adds ebp to the counter in the __0 slot, saves esi in __Y and sets the
    // value that @nextc will store into the __M slot to all ones.
    { } add [edi+__0],ebp
    { } mov [edi+__Y],esi
    { } mov ecx,0FFFFFFFFh
    @@COLS:
    { } mov [edi+ebp*08h-(04h*08h)+__B],ebx
    @mark:
    // Store esi in the __CC slot of index ebx, then restart the scan state:
    // esi = __F entry of the first index, edx = first index.
    { } mov [edi+__L+ebx*08h+__CC],esi
    { } mov esi,[edi+__L+ebp*08h+__F]
    { } mov edx,ebp
    @nextc:
    // Store ecx in the __M slot of index ebx, then step ebx down by 4 and
    // fetch the __B entry of the new index.
    { } lea eax,[ebx-04h]
    { } mov [edi+__L+ebx*08h+__M],ecx
    { } mov ecx,[edi+ebx*08h-(04h*08h)+__B]
    { } mov ebx,eax
    // A borrow here means the new index is below ebp (unsigned): all done.
    { } sub eax,ebp
    { } jc @@init0
    // Invert the __B entry; keep stepping while the inverted value has its
    // sign bit clear.
    { } xor ecx,-1
    { } jns @nextc
    @findc:
    // Candidate = running __F value + data dword, OR-ed with the __D entry.
    // The lea advances edx without touching flags, so the jz tests the
    // result of the or: a zero candidate goes straight to @testr.
    { } add esi,[eax+edi]
    { } or esi,[edi+edx*08h+__D]
    { } lea edx,[edx+04h]
    { } jz @testr
    { } sub eax,ebp
    // ecx = unsigned minimum of the candidates seen so far.
    { } cmp esi,ecx
    { } cmovb ecx,esi
    { } mov esi,[edi+__L+edx*08h+__F]
    { **} cmp edx,00h
    { **} jnz @findc
    // Scan finished without a zero candidate: rewind eax and edx, keep the
    // minimum in esi, and leave through @@EXIT if it has its sign bit set.
    { } mov eax,ebx
    { } mov edx,ebp
    { } mov esi,ecx
    { **} cmp ecx,00h
    { **} js @@EXIT
    @seek0:
    // Second pass over the same entries, looking for the first one whose
    // __F value + data dword equals the minimum held in esi.
    { } mov ecx,[edi+__L+edx*08h+__F]
    { } sub eax,ebp
    { } add edx,04h
    // Taken on the signed-greater condition of the add result above.
    { } jg @free0col
    { } add ecx,[eax+edi]
    { **} cmp ecx,esi
    { **} jnz @seek0
    @testr:
    // edx is already one step past the matching entry, hence the -4 offsets.
    { } lea ecx,[edx-04h]
    // NOTE(review): @seek0col is not defined in the visible listing.
    { **} test [edi+edx*08h-(04h*08h)+__C],ebx
    { **} js @seek0col
    // Cross-link the two indices: __B of ebx gets the matched index, and
    // __C of the matched index gets ebx. Then continue at @mark with ecx = 0.
    { } mov [edi+ebx*08h+__B],ecx
    { } mov [edi+edx*08h-(04h*08h)+__C],ebx
    { } xor ecx,ecx
    { } jmp @mark
    // Saves ecx in the __I slot and either enters the scan loop or leaves.
    @@init0:
    { } mov [edi+__I],ecx
    { } mov eax,ecx
    // Move the low word of ecx into the high word; the result is non-zero
    // exactly when that low word was non-zero, in which case scanning starts.
    { } sal ecx,10h
    { } jnz @scan
    // Otherwise leave with eax = work-area base and esi = pointer kept
    // at [esp+_X].
    { } mov eax,edi
    { } mov esi,[esp+_X]
    // NOTE(review): this target has no @ prefix, unlike @outside and
    // @@outside used elsewhere; none of the three is defined in the visible
    // listing - confirm which label is intended.
    { } jmp outside
    // Adjustment pass: subtracts a masked copy of -edx from the __CC and __F
    // slots of every index. eax carries two packed 16-bit values on entry.
    // The step labels suggest a Munkres-style assignment solver, but that is
    // an inference from the names only - TODO confirm.
    @@1ST_STEP:
    // esi = __C entry selected by the sign-extended low word of eax.
    { } movsx esi,ax
    { } mov esi,[edi+esi*08h+__C]
    { } neg edx
    @1ST_STEP:
    // A sign-extended byte of the __M slot (at byte offset __S) masks edx
    // before it is subtracted from the __CC slot. Presumably that byte is
    // either 0 or -1, giving an all-or-nothing select - TODO confirm.
    { } movsx ecx,byte ptr [edi+__L+ebx*08h+__S+__M]
    { } and ecx,edx
    { } sub [edi+__L+ebx*08h+__CC],ecx
    // Same scheme with a byte of the R slot applied to the __F slot.
    { } movsx ecx,byte ptr [edi+ebx*08h+__S+R]
    { } and ecx,edx
    { } sub [edi+__L+ebx*08h+__F],ecx
    { } add ebx,04h
    { } jnz @1ST_STEP
    // Unpack eax: ebx = sign-extended low word, eax = arithmetic high word.
    { } mov ecx,[edi+__I]
    { } movsx ebx,ax
    { } sar eax,10h
    // A zero __C entry (loaded at the top) goes to @@2ND_STEP.
    { **} test esi,esi
    { **} jz @@2ND_STEP
    // Otherwise: decrement the __A slot, record eax in R of index ebx and
    // ebx in the __M slot of index esi, and keep the unsigned minimum of
    // the __I slot and esi in __I.
    { } add dword ptr [edi+__A],-1
    { } mov [edi+ebx*08h+R],eax
    { } mov [edi+__L+esi*08h+__M],ebx
    { } cmp esi,ecx
    { } cmovb ecx,esi
    { } mov ebx,ebp
    { } mov [edi+__I],ecx
    { } jmp @@6TH_STEP
    // Reached from the scan loop when ebx equals -4. Unpacks ecx, updates
    // the __W slot, then looks for the next index to process.
    @pass:
    { } mov eax,ecx
    // sar leaves the last bit shifted out (bit 15 of the packed value) in
    // CF; when that bit is clear, eax is replaced by the __W slot, so the
    // store below rewrites the old value of __W.
    { } sar ecx,10h
    { } cmovnc eax,[edi+__W]
    { } mov [edi+__W],eax
    { } lea ebx,[ebp+00h]
    { } mov [edi+__L+ecx*08h+__M],esi
    @nx:
    // Step ecx by 4; running out of indices goes to @@1ST_STEP. The loop
    // skips entries whose __M slot is below or equal (unsigned) to the
    // __A slot, then falls through to the code that follows.
    { } mov esi,[edi+__L+ecx*08h+__M +(04h*08h)]
    { } add ecx,04h
    { } jz @@1ST_STEP
    { **} cmp esi,[edi+__A]
    { **} jbe @nx
    // Scan loop, unrolled twice. For the index held in ecx it walks ebx
    // upwards in steps of 4, forms a value per entry and tracks the unsigned
    // minimum in edx together with the ebx at which it occurred.
    @@6TH_STEP:
    // Cache the __CC slot of the current index in __Y and start ebx one
    // step before the first index.
    { } mov esi,[edi+__L+ecx*08h+__CC]
    { } lea ebx,[ebp-04h]
    // Single-byte filler, tagged x1 by the author (presumably for code
    // alignment - TODO confirm).
    { } { x1 } nop
    { } mov eax,ecx
    { } mov [edi+__Y],esi
    @ffd:
    // Move the current index into the high word of ecx; the low word is
    // later written by the 16-bit cmovb instructions.
    { } sal ecx,10h
    { } mov esi,[edi+__L+ebx*08h+(04h*08h)+__F]
    @z:
    { **} cmp ebx,-4
    { **} jz @pass
    @scan:
    // Value = __F entry - __Y slot + data dword, OR-ed with the R entry.
    // A zero result leaves the loop through @@5TH_STEP.
    { } add ebx,04h
    { } sub eax,ebp
    { } sub esi,[edi+__Y]
    { } add esi,[edi+eax]
    { } or esi,[edi+ebx*08h+R]
    { } jz @@5TH_STEP
    // New unsigned minimum: remember ebx in the low word of ecx (the high
    // word is left untouched by the 16-bit move) and the value in edx.
    { } cmp esi,edx
    { } cmovb cx,bx
    { } cmovb edx,esi
    { } mov esi,[edi+__L+ebx*08h+(04h*08h)+__F]
    { **} cmp ebx,-4
    { **} jz @pass
    // Second unrolled copy of the same step.
    { } add ebx,04h
    { } sub eax,ebp
    { } sub esi,[edi+__Y]
    { } add esi,[edi+eax]
    { } or esi,[edi+ebx*08h+R]
    { } jz @@5TH_STEP
    { } cmp esi,edx
    { } cmovb cx,bx
    { } cmovb edx,esi
    { } mov esi,[edi+__L+ebx*08h+(04h*08h)+__F]
    // Three 66h prefix bytes emitted in front of the nop, i.e. a longer
    // filler instruction (presumably alignment padding - TODO confirm).
    // Note that the semicolon is a statement separator in this syntax.
    { } db $66,$66,$66; nop
    // NOTE(review): @zZ is not defined in the visible listing; the only
    // similar label is @z above - confirm the intended target.
    { } jmp @zZ
    // Reached from the scan loop when the value formed for index ebx is zero.
    @@5TH_STEP:
    // Unpack ecx as in @pass: ecx becomes its arithmetic high word, and the
    // __W slot receives the packed value only when bit 15 of it was set
    // (otherwise __W is rewritten with its old value).
    { } mov esi,ecx
    { } sar ecx,10h
    { } cmovnc esi,[edi+__W]
    { } mov [edi+__W],esi
    // A zero __C entry for index ebx goes to @2ND_STEP.
    { } mov esi,[edi+ebx*08h+__C]
    { **} test esi,esi
    { **} jz @2ND_STEP
    // Otherwise record ecx in R of index ebx and set the __M slot of index
    // esi to all ones. No explicit operand size is given on that store.
    { } mov [edi+ebx*08h+R],ecx
    { } mov [edi+__L+esi*08h+__M],0FFFFFFFFh
    // If the low word of __W equals bx, continue at @re.
    { **} cmp word ptr [edi+__W],bx
    { **} jz @re
    // If esi is not below ecx (unsigned), resume the scan at @ffd.
    { **} cmp esi,ecx
    { **} jae @ffd
    // Otherwise restart the scan from index esi, keeping the unsigned
    // minimum of the __I slot and esi in __I.
    { } mov ecx,esi
    { } mov eax,[edi+__I]
    { } cmp esi,eax
    { } cmovb eax,esi
    { } mov [edi+__I],eax
    { } jmp @@6TH_STEP
    @re:
    // Decrement the __A slot, copy esi to edx, fold esi into the minimum
    // kept in __I and restart the scan with ecx = that minimum.
    { } mov ecx,[edi+__I]
    { } add dword ptr [edi+__A],-1
    { } mov edx,esi
    { } cmp esi,ecx
    { } cmovb ecx,esi
    { } mov [edi+__I],ecx
    { } jmp @@6TH_STEP
    // Follows a chain through the __B and R slots, rewriting the __C/__B
    // cross-links along the way. Presumably the augmenting-path update of
    // an assignment solver - TODO confirm.
    @@2ND_STEP:
    // This entry point takes the starting index from eax instead of ecx.
    { } mov ecx,eax
    @2ND_STEP:
    // Link ebx and ecx to each other, remembering the previous __B partner
    // of ecx in edx.
    { } mov [edi+ebx*08h+__C],ecx
    { } mov edx,[edi+ecx*08h+__B]
    { } mov [edi+ecx*08h+__B],ebx
    // Continue from the previous partner with its R entry as the next ecx;
    // the chain goes on while edx+ebp produces a carry.
    { } mov ecx,[edi+edx*08h+R]
    { } mov ebx,edx
    { } add edx,ebp
    { } jc @2ND_STEP
    { } mov ecx,esi
    // Subtract ebp from the counter in the __0 slot; zero means finished.
    // NOTE(review): @@outside is not defined in the visible listing.
    { } sub [edi+__0],ebp
    { } jz @@outside
    // Reset pass, unrolled twice: walks esi downwards in steps of 4 until it
    // equals ebp. For every index it writes the inverted __B entry into the
    // __M slot and copies the __D slot into the R slot.
    @@9ST_STEP:
    { } mov eax,[edi+esi*08h-(04h*08h)+__B]
    { } xor eax,-1
    { } mov [edi+__L+esi*08h-(04h*08h)+__M],eax
    // lea keeps the flags of the xor, so the cmovs copies the new esi into
    // ecx when the inverted __B entry has its sign bit set.
    { } lea esi,[esi-04h]
    { } cmovs ecx,esi
    { } mov ebx,[edi+esi*08h+__D]
    // Preload the __B entry consumed by the second unrolled copy.
    { } mov eax,[edi+esi*08h-(04h*08h)+__B]
    { } mov [edi+esi*08h+R],ebx
    { **} cmp ebp,esi
    { **} jz @i9
    // Second unrolled copy, using the __B entry preloaded above.
    { } xor eax,-1
    { } mov [edi+__L+esi*08h-(04h*08h)+__M],eax
    { } lea esi,[esi-04h]
    { } cmovs ecx,esi
    { } mov eax,[edi+esi*08h+__D]
    { } mov [edi+esi*08h+R],eax
    { **} cmp ebp,esi
    { **} jnz @@9ST_STEP
    @i9:
    // Save ecx in the __I slot and go back to the scan loop.
    { } mov [edi+__I],ecx
    {- } jmp @@6TH_STEP

    [ Szerkesztve ]

    Arguing on the Internet is like running in the Special Olympics. Even if you win, you are still ... ˙˙˙ Real Eyes Realize Real Lies ˙˙˙

Aktív témák