• <ins id="pjuwb"></ins>
    <blockquote id="pjuwb"><pre id="pjuwb"></pre></blockquote>
    <noscript id="pjuwb"></noscript>
          <sup id="pjuwb"><pre id="pjuwb"></pre></sup>
            <dd id="pjuwb"></dd>
            <abbr id="pjuwb"></abbr>

            醬壇子

            專注C++技術 在這里寫下自己的學習心得 感悟 和大家討論 共同進步(歡迎批評!!!)

              C++博客 :: 首頁 :: 聯系 :: 聚合  :: 管理
              66 Posts :: 16 Stories :: 236 Comments :: 0 Trackbacks

            公告

            王一偉 湖南商學院畢業 電子信息工程專業

            常用鏈接

            留言簿(19)

            我參與的團隊

            搜索

            •  

            積分與排名

            • 積分 - 387045
            • 排名 - 64

            最新隨筆

            最新評論

            閱讀排行榜

            評論排行榜

            void?*?memcpy_amd(void?*dest,?const?void?*src,?size_t?n)
            {
            ??__asm?{

            mov ecx,?[n] ;?number?of?bytes?to?copy
            mov edi,?[dest] ;?destination
            mov esi,?[src] ;?source
            mov ebx,?ecx ;?keep?a?copy?of?count

            cld
            cmp ecx,?TINY_BLOCK_COPY
            jb $memcpy_ic_3 ;?tiny??skip?mmx?copy

            cmp ecx,?32*1024 ;?don't?align?between?32k-64k?because
            jbe $memcpy_do_align ;??it?appears?to?be?slower
            cmp ecx,?64*1024
            jbe $memcpy_align_done
            $memcpy_do_align:
            mov ecx,?8 ;?a?trick?that's?faster?than?rep?movsb...
            sub ecx,?edi ;?align?destination?to?qword
            and ecx,?111b ;?get?the?low?bits
            sub ebx,?ecx ;?update?copy?count
            neg ecx ;?set?up?to?jump?into?the?array
            add ecx,?offset?$memcpy_align_done
            jmp ecx ;?jump?to?array?of?movsb's

            align?4
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb
            movsb

            $memcpy_align_done: ;?destination?is?dword?aligned
            mov ecx,?ebx ;?number?of?bytes?left?to?copy
            shr ecx,?6 ;?get?64-byte?block?count
            jz $memcpy_ic_2 ;?finish?the?last?few?bytes

            cmp ecx,?IN_CACHE_COPY/64 ;?too?big?4?cache??use?uncached?copy
            jae $memcpy_uc_test

            //?This?is?small?block?copy?that?uses?the?MMX?registers?to?copy?8?bytes
            //?at?a?time.??It?uses?the?"unrolled?loop"?optimization,?and?also?uses
            //?the?software?prefetch?instruction?to?get?the?data?into?the?cache.
            align?16
            $memcpy_ic_1: ;?64-byte?block?copies,?in-cache?copy

            prefetchnta?[esi?+?(200*64/34+192)] ;?start?reading?ahead

            movq mm0,?[esi+0] ;?read?64?bits
            movq mm1,?[esi+8]
            movq [edi+0],?mm0 ;?write?64?bits
            movq [edi+8],?mm1 ;????note:??the?normal?movq?writes?the
            movq mm2,?[esi+16] ;????data?to?cache;?a?cache?line?will?be
            movq mm3,?[esi+24] ;????allocated?as?needed,?to?store?the?data
            movq [edi+16],?mm2
            movq [edi+24],?mm3
            movq mm0,?[esi+32]
            movq mm1,?[esi+40]
            movq [edi+32],?mm0
            movq [edi+40],?mm1
            movq mm2,?[esi+48]
            movq mm3,?[esi+56]
            movq [edi+48],?mm2
            movq [edi+56],?mm3

            add esi,?64 ;?update?source?pointer
            add edi,?64 ;?update?destination?pointer
            dec ecx ;?count?down
            jnz $memcpy_ic_1 ;?last?64-byte?block?

            $memcpy_ic_2:
            mov ecx,?ebx ;?has?valid?low?6?bits?of?the?byte?count
            $memcpy_ic_3:
            shr ecx,?2 ;?dword?count
            and ecx,?1111b ;?only?look?at?the?"remainder"?bits
            neg ecx ;?set?up?to?jump?into?the?array
            add ecx,?offset?$memcpy_last_few
            jmp ecx ;?jump?to?array?of?movsd's

            $memcpy_uc_test:
            cmp ecx,?UNCACHED_COPY/64 ;?big?enough??use?block?prefetch?copy
            jae $memcpy_bp_1

            $memcpy_64_test:
            or ecx,?ecx ;?tail?end?of?block?prefetch?will?jump?here
            jz $memcpy_ic_2 ;?no?more?64-byte?blocks?left

            //?For?larger?blocks,?which?will?spill?beyond?the?cache,?it's?faster?to
            //?use?the?Streaming?Store?instruction?MOVNTQ.???This?write?instruction
            //?bypasses?the?cache?and?writes?straight?to?main?memory.??This?code?also
            //?uses?the?software?prefetch?instruction?to?pre-read?the?data.
            align?16
            $memcpy_uc_1: ;?64-byte?blocks,?uncached?copy

            prefetchnta?[esi?+?(200*64/34+192)] ;?start?reading?ahead

            movq mm0,[esi+0] ;?read?64?bits
            add edi,64 ;?update?destination?pointer
            movq mm1,[esi+8]
            add esi,64 ;?update?source?pointer
            movq mm2,[esi-48]
            movntq [edi-64],?mm0 ;?write?64?bits,?bypassing?the?cache
            movq mm0,[esi-40] ;????note:?movntq?also?prevents?the?CPU
            movntq [edi-56],?mm1 ;????from?READING?the?destination?address
            movq mm1,[esi-32] ;????into?the?cache,?only?to?be?over-written
            movntq [edi-48],?mm2 ;????so?that?also?helps?performance
            movq mm2,[esi-24]
            movntq [edi-40],?mm0
            movq mm0,[esi-16]
            movntq [edi-32],?mm1
            movq mm1,[esi-8]
            movntq [edi-24],?mm2
            movntq [edi-16],?mm0
            dec ecx
            movntq [edi-8],?mm1
            jnz $memcpy_uc_1 ;?last?64-byte?block?

            jmp $memcpy_ic_2 ;?almost?done

            //?For?the?largest?size?blocks,?a?special?technique?called?Block?Prefetch
            //?can?be?used?to?accelerate?the?read?operations.???Block?Prefetch?reads
            //?one?address?per?cache?line,?for?a?series?of?cache?lines,?in?a?short?loop.
            //?This?is?faster?than?using?software?prefetch,?in?this?case.
            //?The?technique?is?great?for?getting?maximum?read?bandwidth,
            //?especially?in?DDR?memory?systems.
            $memcpy_bp_1: ;?large?blocks,?block?prefetch?copy

            cmp ecx,?CACHEBLOCK ;?big?enough?to?run?another?prefetch?loop?
            jl $memcpy_64_test ;?no,?back?to?regular?uncached?copy

            mov eax,?CACHEBLOCK?/?2 ;?block?prefetch?loop,?unrolled?2X
            add esi,?CACHEBLOCK?*?64 ;?move?to?the?top?of?the?block
            align?16
            $memcpy_bp_2:
            mov edx,?[esi-64] ;?grab?one?address?per?cache?line
            mov edx,?[esi-128] ;?grab?one?address?per?cache?line
            sub esi,?128 ;?go?reverse?order
            dec eax ;?count?down?the?cache?lines
            jnz $memcpy_bp_2 ;?keep?grabbing?more?lines?into?cache

            mov eax,?CACHEBLOCK ;?now?that?it's?in?cache,?do?the?copy
            align?16
            $memcpy_bp_3:
            movq mm0,?[esi???] ;?read?64?bits
            movq mm1,?[esi+?8]
            movq mm2,?[esi+16]
            movq mm3,?[esi+24]
            movq mm4,?[esi+32]
            movq mm5,?[esi+40]
            movq mm6,?[esi+48]
            movq mm7,?[esi+56]
            add esi,?64 ;?update?source?pointer
            movntq [edi???],?mm0 ;?write?64?bits,?bypassing?cache
            movntq [edi+?8],?mm1 ;????note:?movntq?also?prevents?the?CPU
            movntq [edi+16],?mm2 ;????from?READING?the?destination?address?
            movntq [edi+24],?mm3 ;????into?the?cache,?only?to?be?over-written,
            movntq [edi+32],?mm4 ;????so?that?also?helps?performance
            movntq [edi+40],?mm5
            movntq [edi+48],?mm6
            movntq [edi+56],?mm7
            add edi,?64 ;?update?dest?pointer

            dec eax ;?count?down

            jnz $memcpy_bp_3 ;?keep?copying
            sub ecx,?CACHEBLOCK ;?update?the?64-byte?block?count
            jmp $memcpy_bp_1 ;?keep?processing?chunks

            //?The?smallest?copy?uses?the?X86?"movsd"?instruction,?in?an?optimized
            //?form?which?is?an?"unrolled?loop".???Then?it?handles?the?last?few?bytes.
            align?4
            movsd
            movsd ;?perform?last?1-15?dword?copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd ;?perform?last?1-7?dword?copies
            movsd
            movsd
            movsd
            movsd
            movsd
            movsd

            $memcpy_last_few: ;?dword?aligned?from?before?movsd's
            mov ecx,?ebx ;?has?valid?low?2?bits?of?the?byte?count
            and ecx,?11b ;?the?last?few?cows?must?come?home
            jz $memcpy_final ;?no?more,?let's?leave
            rep movsb ;?the?last?1,?2,?or?3?bytes

            $memcpy_final:?
            emms ;?clean?up?the?MMX?state
            sfence ;?flush?the?write?buffer
            mov eax,?[dest] ;?ret?value?=?destination?pointer

            ????}
            }

            posted on 2007-01-02 14:12 @王一偉 閱讀(1349) 評論(2)  編輯 收藏 引用

            Feedback

            # re: amd的Memcpy函數 2007-12-29 16:44 Solomon Joh
            弓雖!全是asm啊,頭大中……  回復  更多評論
              

            # re: amd的Memcpy函數 2007-12-30 08:04 @王一偉
            呵呵 粘帖過來的 還沒來得及研究呢
            有時間研究下可以探討探討

            指令一般般多拉 哈哈  回復  更多評論
              


            只有注冊用戶登錄后才能發表評論。
            網站導航: 博客園   IT新聞   BlogJava   博問   Chat2DB   管理


            欧美一区二区三区久久综合| 国产精品成人99久久久久 | 色综合久久夜色精品国产| 三级韩国一区久久二区综合| 久久人妻无码中文字幕| 久久99毛片免费观看不卡| 欧美精品一区二区久久| 久久精品亚洲日本波多野结衣 | 无码乱码观看精品久久| 伊人久久大香线焦综合四虎| 欧美激情精品久久久久久| MM131亚洲国产美女久久| 区久久AAA片69亚洲| 2021国产成人精品久久| 久久久一本精品99久久精品66| 久久有码中文字幕| 国内精品久久久久| 久久久一本精品99久久精品88| 污污内射久久一区二区欧美日韩| 国产69精品久久久久99| 97久久精品无码一区二区| 97香蕉久久夜色精品国产 | 2021久久国自产拍精品| 成人久久免费网站| 亚洲国产精品成人久久蜜臀| 久久se这里只有精品| 国产精品狼人久久久久影院 | 成人久久综合网| 亚洲香蕉网久久综合影视 | 国产一级做a爰片久久毛片| 中文字幕无码精品亚洲资源网久久| 欧美亚洲另类久久综合婷婷 | 色偷偷偷久久伊人大杳蕉| 久久久国产99久久国产一| 三级韩国一区久久二区综合| 久久国产精品波多野结衣AV| 97久久精品人人澡人人爽| 国产精品xxxx国产喷水亚洲国产精品无码久久一区 | MM131亚洲国产美女久久| 久久青青草原亚洲av无码app| 97精品依人久久久大香线蕉97|