• <ins id="pjuwb"></ins>
    <blockquote id="pjuwb"><pre id="pjuwb"></pre></blockquote>
    <noscript id="pjuwb"></noscript>
          <sup id="pjuwb"><pre id="pjuwb"></pre></sup>
            <dd id="pjuwb"></dd>
            <abbr id="pjuwb"></abbr>

            qiezi的學習園地

            AS/C/C++/D/Java/JS/Python/Ruby

              C++博客 :: 首頁 :: 新隨筆 ::  ::  :: 管理 ::
            周末抽空做了點小測試,根據http://blog.vckbase.com/jzhang/archive/2006/03/28/18807.html中m網友修改的算法,python版本中讀取所有行以后就做一個排序,再去除重復項。這個算法在我的機器上執行時間是1735ms左右,屬于python版本中最快的一個。

            D版本暫還沒想到有更優化的做法,D在處理以char[]作key的關聯數組時,判斷方法是先判斷hash,如果hash相等,則繼續做字符串判斷。它執行時間是1120ms左右。

            以D版本為基礎,自己寫了一個C++的Email類:

            class?Email
            {
            private:
            ????????
            string?mail;
            ????????size_t?hash;
            ????????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????????Email?(
            const?char*?mail_)
            ????????????????:?mail(mail_),?hash(my_hash(mail_))
            ????????{
            ????????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????????
            if?(lhs.hash?==?rhs.hash)
            ????????????????
            return?lhs.mail?<?rhs.mail;
            ????????
            return?lhs.hash?<?rhs.hash;
            }

            把它插入set并判斷是否有重復。

            這個程序由于string的大量拷貝,以及大量內存分配,執行時間相當長,在我的機器上是5s左右。D和python版本由于對象拷貝成本較低,加上都有內存分配策略,自然有一些優勢。

            退而求其次,既然hash沖突的幾率較低,試試只保存hash:

            class?Email
            {
            private:
            ????????size_t?hash;
            ????????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????????Email?(
            const?char*?mail_)
            ????????????????:?hash(my_hash(mail_))
            ????????{
            ????????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????????
            return?lhs.hash?<?rhs.hash;
            }

            這次測試就比較快了,耗時僅1020ms左右,比D版本還要快,當然它不是完善的版本。

            考慮到構造成本,于是改為只用一個set<int>來保存hash值再測試,這次耗時是930ms。

            實際上可以做一個改進的C++版本,一次性讀入文件的全部內容到一個大緩沖區,把所有的\n字符修改為\0,用一個動態數組保存緩沖區的所有字符串指針,hash值也須計算并保存到數組。再用D的索引方式,先hash比較,再字符串比較,效率應該也不低。

            實現了一個:
            #include?<iostream>
            #include?
            <string>
            #include?
            <set>
            #include?
            <fstream>
            #include?
            <iterator>
            #include?
            <sys/time.h>
            using?namespace?std;


            size_t?my_hash?(
            const?char*?str)
            {
            ????????size_t?ret?
            =?0;
            ????????
            while?(*str)
            ????????????????ret?
            =?11?*?ret?+?*str++;
            ????????
            return?ret;
            }

            class?Email
            {
            private:
            ????????size_t?hash;
            ????????
            const?char*?mail;
            ????????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????????Email?(
            const?char*?mail_)
            ????????????????:?hash(my_hash(mail_)),?mail(mail_)
            ????????{
            ????????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????????
            if?(lhs.hash?==?rhs.hash)
            ????????????????
            return?strcmp(lhs.mail,?rhs.mail)?<?0;
            ????????
            return?lhs.hash?<?rhs.hash;
            }

            int?main(int?argc,?char**?argv)
            {
            ????????
            if?(argc?<?3)
            ????????{
            ????????????????cout?
            <<?"Wrong?arguments"?<<?endl;
            ????????????????
            return?1;
            ????????}

            ????????FILE
            *?fin?=?fopen(argv[1],?"r");
            ????????
            if?(!fin)
            ????????{
            ????????????????cout?
            <<?"Invalid?input?file"?<<?endl;
            ????????????????
            return?2;
            ????????}
            ????????FILE
            *?fout?=?fopen(argv[2],?"w");
            ????????
            if?(!fout)
            ????????{
            ????????????????fclose(fin);
            ????????????????cout?
            <<?"Invalid?output?file"?<<?endl;
            ????????????????
            return?3;
            ????????}

            ????????timeval?start,?end;

            ????????
            const?int?BUF_SIZE?=?20?*?1024?*?1024;
            ????????
            char*?buffer?=?new?char[BUF_SIZE];
            ????????memset(buffer,?
            0,?BUF_SIZE);

            ????????gettimeofday(
            &start,?0);
            ????????
            set<Email>?emails;

            ????????size_t?read?
            =?fread?(buffer,?BUF_SIZE,?1,?fin);
            ????????
            char*?begin?=?buffer;
            ????????
            char*?current?=?buffer;

            ????????
            while?(*current?!=?'\0')
            ????????{
            ????????????????
            if?(*current?==?'\n')
            ????????????????{
            ????????????????????????
            *current?=?'\0';
            ????????????????????????
            if?(emails.insert(begin).second){
            ????????????????????????????????fputs(begin,?fout);
            ????????????????????????????????fwrite(
            "\n",?1,?1,?fout);
            ????????????????????????}
            ????????????????????????begin?
            =?current?+?1;
            ????????????????}
            ????????????????
            ++?current;
            ????????}

            ??????? fclose(fout);
            ??????? fclose(fin);

            ????????gettimeofday(&end,?0);

            ????????printf(
            "Time?used:?%d?ms\n",?((end.tv_sec?-?start.tv_sec)?*?1000?+?(end.tv_usec?-?start.tv_usec)?/?1000));

            ??????? delete[] buffer;
            ????????
            return?0;
            }

            memset不是必須的,不過我不知道如何獲取讀入的大小。fread讀取后,如果讀到EOF,則返回值為0。所以我這里用memset先初始化內存,但不把它計入耗時。new操作也沒計入,因為其它語言比如python、D在啟動時都由運行時做了類似工作。

            這個程序在我的機器上耗時為1350ms左右。我想可慢在set上,對象拷貝?內存分配?

            做了幾個優化版本,沒多大提高。


            重新測試了下:
            A、python(m網友版本):
            lijie t # python test.py
            1689.0411377
            lijie t # python test.py
            1711.40599251
            lijie t # python test.py
            1699.63312149
            lijie t # python test.py
            1712.00013161
            lijie t # python test.py
            1713.8838768

            B、D版本:
            lijie t # ./testd email.txt email-new.txt
            1091
            lijie t # ./testd email.txt email-new.txt
            1070
            lijie t # ./testd email.txt email-new.txt
            1062
            lijie t # ./testd email.txt email-new.txt
            1062
            lijie t # ./testd email.txt email-new.txt
            1096

            C、C++只比較hash,set<Email>版本:
            lijie t # ./test3 email.txt email-new.txt
            Time used: 981 ms
            lijie t # ./test3 email.txt email-new.txt
            Time used: 1000 ms
            lijie t # ./test3 email.txt email-new.txt
            Time used: 980 ms
            lijie t # ./test3 email.txt email-new.txt
            Time used: 986 ms
            lijie t # ./test3 email.txt email-new.txt
            Time used: 987 ms

            D、C++只比較hash,set<int>版本:
            lijie t # ./test4 email.txt email-new.txt
            Time used: 951 ms
            lijie t # ./test4 email.txt email-new.txt
            Time used: 953 ms
            lijie t # ./test4 email.txt email-new.txt
            Time used: 947 ms
            lijie t # ./test4 email.txt email-new.txt
            Time used: 950 ms
            lijie t # ./test4 email.txt email-new.txt
            Time used: 962 ms

            E、C++大緩沖區,比較hash和字符串,set<Email>版本:
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1375 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1359 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1369 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1378 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1396 ms

            F、C++大緩沖區,比較字符串版本:
            lijie t # ./test6 email.txt email-new.txt
            Time used: 1168 ms
            lijie t # ./test6 email.txt email-new.txt
            Time used: 1169 ms
            lijie t # ./test6 email.txt email-new.txt
            Time used: 1171 ms
            lijie t # ./test6 email.txt email-new.txt
            Time used: 1179 ms
            lijie t # ./test6 email.txt email-new.txt
            Time used: 1169 ms

            從C、E和F來看,對象拷貝成本是比較高的,E版本僅僅比C版本多了個const char*成員變量,hash值比較散,很少會真的執行到strcmp。保持E版本對象結構不變,把operator <里面的實現改為C版本,測試結果如下:
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1355 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1360 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1348 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1353 ms
            lijie t # ./test5 email.txt email-new.txt
            Time used: 1379 ms

            效率只提高了一點點,這個版本僅僅比C版本多了個成員變量拷貝,竟然慢了這么多。說明Email對象的2個成員變量拷貝成本的確很高。

            F版本相比之下反而效率很不錯,主要原因是email數據不夠復雜,僅通過前幾位就可以比較出結果。如果每行數據比較長,而且很多行要到后幾個字符才能比較出來,肯定就不那么快了。

            hash值的計算雖然執行了一系列乘法,不過還是相當迅速。

            D語言版本執行了hash值和字符串比較,是比較完善的,效率很不錯。C++相應版本看來要提高set的效率才能達到。


            jzhang的第一個python版本在我的機器上執行如下:
            lijie t # python test2.py
            3122.9569912 ms
            lijie t # python test2.py
            3209.42997932 ms
            lijie t # python test2.py
            3141.47305489 ms
            lijie t # python test2.py
            3129.57286835 ms
            lijie t # python test2.py
            3196.03514671 ms

            我做了點修改,執行速度提高了一些:
            #remove?duplicated?email?address?from?file
            import?datetime
            from?time?import?time
            if?__name__?==?"__main__":
            ?start?
            =?time()
            ?hashtable?
            =?{}
            ?f?
            =?file("email.txt","r")
            ?f2?
            =?file("email_new.txt","w")
            ?
            for?line?in?f.xreadlines():
            ??
            if?not?hashtable.has_key(line):
            ???hashtable[line]?
            =?1
            ???f2.write(line)
            ?f.close()
            ?f2.close()
            ?print?(time()?
            -?start)?*?1000,?"ms"
            在我的機器上執行結果如下:
            lijie t # python test1.py
            2239.22801018 ms
            lijie t # python test1.py
            2301.00703239 ms
            lijie t # python test1.py
            2282.06086159 ms
            lijie t # python test1.py
            2296.57006264 ms
            lijie t # python test1.py
            2281.25810623 ms

            不過還是沒有m網友的效率高。


            在F版本的基礎上,借鑒m網友的做法,實現一個G版本:

            G、排序并去除重復元素,比較hash和字符串版本:
            #include?<iostream>
            #include?
            <string>
            #include?
            <fstream>
            #include?
            <iterator>
            #include?
            <sys/time.h>
            #include?
            <vector>
            using?namespace?std;


            size_t?my_hash?(
            const?char*?str)
            {
            ????????size_t?ret?
            =?0;
            ????????
            while?(*str)
            ????????????????ret?
            =?11?*?ret?+?*str++;
            ????????
            return?ret;
            }

            class?Email
            {
            private:
            ????????size_t?hash;
            ????????
            const?char*?mail;
            ????????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????????Email?(
            const?char*?mail_)
            ????????????????:?hash(my_hash(mail_)),?mail(mail_)
            ????????{
            ????????}

            ????????
            bool?operator?==?(const?Email&?rhs)
            ????????{
            ????????????????
            if?(hash?==?rhs.hash)
            ????????????????????????
            return?strcmp(mail,?rhs.mail)?==?0;
            ????????????????
            return?false;
            ????????}

            ????????
            const?char*?getEmail()const
            ????????{
            ????????????????
            return?mail;
            ????????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????????
            if?(lhs.hash?==?rhs.hash)
            ????????????????
            return?strcmp(lhs.mail,?rhs.mail)?<?0;
            ????????
            return?lhs.hash?<?rhs.hash;
            }

            int?main(int?argc,?char**?argv)
            {
            ????????
            if?(argc?<?3)
            ????????{
            ????????????????cout?
            <<?"Wrong?arguments"?<<?endl;
            ????????????????
            return?1;
            ????????}

            ????????FILE
            *?fin?=?fopen(argv[1],?"r");
            ????????
            if?(!fin)
            ????????{
            ????????????????cout?
            <<?"Invalid?input?file"?<<?endl;
            ????????????????
            return?2;
            ????????}
            ????????FILE
            *?fout?=?fopen(argv[2],?"w");
            ????????
            if?(!fout)
            ????????{
            ????????????????fclose(fin);
            ????????????????cout?
            <<?"Invalid?output?file"?<<?endl;
            ????????????????
            return?3;
            ????????}

            ????????timeval?start,?end;

            ????????
            const?int?BUF_SIZE?=?20?*?1024?*?1024;
            ????????
            char*?buffer?=?new?char[BUF_SIZE];
            ????????memset(buffer,?
            0,?BUF_SIZE);

            ????????gettimeofday(
            &start,?0);
            ????????vector
            <Email>?emails;

            ????????size_t?read?
            =?fread?(buffer,?BUF_SIZE,?1,?fin);
            ????????
            char*?begin?=?buffer;
            ????????
            char*?current?=?buffer;

            ????????
            while?(*current?!=?'\0')
            ????????{
            ????????????????
            if?(*current?==?'\n')
            ????????????????{
            ????????????????????????
            *current?=?'\0';
            ????????????????????????emails.push_back(begin);
            ????????????????????????begin?
            =?current?+?1;
            ????????????????}
            ????????????????
            ++?current;
            ????????}
            ????????fclose(fin);
            ????????sort(emails.begin(),?emails.end());
            ????????emails.erase?(unique(?emails.begin(),?emails.end()?),?emails.end());

            ????????
            for?(vector<Email>::const_iterator?iter?=?emails.begin();
            ?????????????iter?
            !=?emails.end();
            ?????????????iter?
            ++)
            ????????{
            ????????????????fputs((
            *iter).getEmail(),?fout);
            ????????????????fwrite(
            "\n",?1,?1,?fout);
            ????????}

            ????????fclose(fout);

            ????????gettimeofday(
            &end,?0);

            ????????printf(
            "Time?used:?%d?ms\n",?((end.tv_sec?-?start.tv_sec)?*?1000?+?(end.tv_usec?-?start.tv_usec)?/?1000));

            ????????delete[]?buffer;

            ????????
            return?0;
            }

            在我的機器上執行如下:
            lijie t # ./test7 email.txt email-new.txt
            Time used: 676 ms
            lijie t # ./test7 email.txt email-new.txt
            Time used: 675 ms
            lijie t # ./test7 email.txt email-new.txt
            Time used: 671 ms
            lijie t # ./test7 email.txt email-new.txt
            Time used: 669 ms
            lijie t # ./test7 email.txt email-new.txt
            Time used: 673 ms

            比F版本快了2倍,也快過了其它所有版本。不過由于數據是vector保存的,在數據大量重復的情況下,性能可能會有較大的降低。

            把operator<和operator==的實現改為strcmp比較,執行結果如下:
            lijie t # ./test8 email.txt email-new.txt
            Time used: 1275 ms
            lijie t # ./test8 email.txt email-new.txt
            Time used: 1267 ms
            lijie t # ./test8 email.txt email-new.txt
            Time used: 1297 ms
            lijie t # ./test8 email.txt email-new.txt
            Time used: 1296 ms
            lijie t # ./test8 email.txt email-new.txt
            Time used: 1271 ms


            修改了下,增加了計時,修正了fread使用錯誤。

            #include?<iostream>
            #include?
            <string>
            #include?
            <fstream>
            #include?
            <iterator>
            #include?
            <vector>
            using?namespace?std;

            #ifdef?_WIN32
            #?include?
            <windows.h>
            #else?//?_WIN32
            #?include?
            <sys/time.h>
            #endif?//?_WIN32


            size_t?my_hash?(
            const?char*?str)
            {
            ????????size_t?ret?
            =?0;
            ????????
            while?(*str)
            ????????????????ret?
            =?11?*?ret?+?*str++;
            ????????
            return?ret;
            }

            class?Email
            {
            private:
            ????????size_t?hash;
            ????????
            const?char*?mail;
            ????????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????????Email?(
            const?char*?mail_)
            ????????????????:?hash(my_hash(mail_)),?mail(mail_)
            ????????{
            ????????}

            ????????
            bool?operator?==?(const?Email&?rhs)
            ????????{
            ????????????????
            if?(hash?==?rhs.hash)
            ????????????????????????
            return?strcmp(mail,?rhs.mail)?==?0;
            ????????????????
            return?false;
            ????????}

            ????????
            const?char*?getEmail()const
            ????????{
            ????????????????
            return?mail;
            ????????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????????
            if?(lhs.hash?==?rhs.hash)
            ????????????????
            return?strcmp(lhs.mail,?rhs.mail)?<?0;
            ????????
            return?lhs.hash?<?rhs.hash;
            }

            #ifndef?_WIN32
            class?Timer
            {
            ????????timeval?begin,?end;
            public:
            ????????
            void?start?()?{gettimeofday(&begin,?0);}
            ????????
            void?stop?()?{gettimeofday(&end,?0);}
            ????????size_t?milliseconds?()?
            const?{
            ????????????????
            return?(end.tv_sec?-?begin.tv_sec)?*?1000?+?(end.tv_usec?-?begin.tv_usec)?/?1000;
            ????????}
            };
            #else?//?_WIN32
            class?Timer
            {
            ????????DWORD?begin,?end;
            public:
            ????????
            void?start?()?{begin?=?GetTickCount();}
            ????????
            void?stop?()?{end?=?GetTickCount();}
            ????????size_t?milliseconds?()?
            const?{
            ????????????????
            return?end?-?begin;
            ????????}
            };
            #endif?//?_WIN32


            int?main(int?argc,?char**?argv)
            {
            ????????
            if?(argc?<?3)
            ????????{
            ????????????????cout?
            <<?"Wrong?arguments"?<<?endl;
            ????????????????
            return?1;
            ????????}

            ????????
            for?(int?i=0;?i<10;?++i)?{

            ???????FILE
            *?fin?=?fopen(argv[1],?"r");
            ????????
            if?(!fin)
            ????????{
            ????????????????cout?
            <<?"Invalid?input?file"?<<?endl;
            ????????????????
            return?2;
            ????????}
            ????????FILE
            *?fout?=?fopen(argv[2],?"w");
            ????????
            if?(!fout)
            ????????{
            ????????????????fclose(fin);
            ????????????????cout?
            <<?"Invalid?output?file"?<<?endl;
            ????????????????
            return?3;
            ????????}

            ????????Timer?total,?part;
            ????????total.start();
            ????????part.start();

            ????????
            const?int?BUF_SIZE?=?20?*?1024?*?1024;

            ????????
            char*?buffer?=?new?char[BUF_SIZE];
            ????????
            if?(!buffer){
            ????????????????
            //?cout?<<?"Alloc?buffer?failed"?<<?endl;
            ????????????????return?4;
            ????????}

            ????????part.stop();
            ????????
            //?cout?<<?"Alloc?buffer,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;
            ????????part.start();

            ????????size_t?read?
            =?fread?(buffer,?1,?BUF_SIZE,?fin);
            ????????fclose(fin);
            ????????buffer[read]?
            =?'\0';
            ????????part.stop();
            ????????
            //?cout?<<?"Read?file,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;
            ????????part.start();

            ????????vector
            <Email>?emails;

            ????????
            char*?begin?=?buffer;
            ????????
            char*?current?=?buffer;

            ????????
            while?(*current?!=?'\0')
            ????????{
            ????????????????
            if?(*current?==?'\n')
            ????????????????{
            ????????????????????????
            *current?=?'\0';
            ????????????????????????emails.push_back(begin);
            ????????????????????????begin?
            =?current?+?1;
            ????????????????}
            ????????????????
            ++?current;
            ????????}

            ????????part.stop();
            ????????
            //?cout?<<?"Put?emails?into?vector,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;
            ????????part.start();

            ????????sort(emails.begin(),?emails.end());
            ????????part.stop();
            ????????
            //?cout?<<?"Sort?emails,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;
            ????????part.start();

            ????????emails.erase?(unique(?emails.begin(),?emails.end()?),?emails.end());
            ????????part.stop();
            ????????
            //?cout?<<?"Unique?emails,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;
            ????????part.start();

            ????????
            for?(vector<Email>::const_iterator?iter?=?emails.begin();
            ?????????????iter?
            !=?emails.end();
            ?????????????iter?
            ++)
            ????????{
            ????????????????fputs((
            *iter).getEmail(),?fout);
            ????????????????fwrite(
            "\n",?1,?1,?fout);
            ????????}

            ????????fclose(fout);

            ????????part.stop();
            ????????
            //?cout?<<?"Write?emails?into?new?file,?"?<<?part.milliseconds()?<<?"?ms?used."?<<?endl;

            ????????total.stop();

            ????????cout?
            <<?"Total?used:?"?<<?total.milliseconds()?<<?"?ms."?<<?endl;

            ????????delete[]?buffer;
            ????????}

            ????????
            return?0;
            }

            使用“-O3 -fomit-frame-pointer -funroll-loops -mtune=pentium4”選項編譯,耗時從680ms減少到620ms


            優化文件讀寫版:
            #include?<iostream>
            #include?
            <string>
            #include?
            <fstream>
            #include?
            <iterator>
            #include?
            <vector>
            using?namespace?std;

            //?config
            #define?USE_CACHE
            //#define?PROFILE
            //?end?config


            #ifdef?_WIN32
            #?include?
            <windows.h>
            #else?//?_WIN32
            #?include?
            <sys/time.h>
            #endif?//?_WIN32

            #ifndef?_WIN32
            class?Timer
            {
            ????timeval?begin,?end;
            public:
            ????
            void?start?()?{gettimeofday(&begin,?0);}
            ????
            void?stop?()?{gettimeofday(&end,?0);}
            ????size_t?milliseconds?()?
            const?{
            ????????
            return?(end.tv_sec?-?begin.tv_sec)?*?1000?+?(end.tv_usec?-?begin.tv_usec)?/?1000;
            ????}
            };
            #else?//?_WIN32
            class?Timer
            {
            ????DWORD?begin,?end;
            public:
            ????
            void?start?()?{begin?=?GetTickCount();}
            ????
            void?stop?()?{end?=?GetTickCount();}
            ????size_t?milliseconds?()?
            const?{
            ????????
            return?end?-?begin;
            ????}
            };
            #endif?//?_WIN32


            #ifdef?PROFILE
            #?define?PROFILE_OUTPUT(timer,info)?\
            ????timer.stop();?\
            ????cout?
            <<?info?<<?":?"?<<?timer.milliseconds()?<<?"?ms?used."?<<?endl;?\
            ????timer.start()
            #else?//?PROFILE
            #?define?PROFILE_OUTPUT(timer,info)
            #endif?//?PROFILE



            size_t?my_hash?(
            const?char*?str)
            {
            ????size_t?ret?
            =?0;
            ????
            while?(*str)
            ????????????ret?
            =?11?*?ret?+?*str++;
            ????
            return?ret;
            }

            class?Email
            {
            private:
            ????size_t?hash;
            ????
            const?char*?mail;
            ????friend?
            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs);
            public:
            ????Email?(
            const?char*?mail_)
            ????????:?hash(my_hash(mail_)),?mail(mail_)
            ????{
            ????}

            ????
            bool?operator?==?(const?Email&?rhs)
            ????{
            ????????
            if?(hash?==?rhs.hash)
            ????????????
            return?strcmp(mail,?rhs.mail)?==?0;
            ????????
            return?false;
            ????}

            ????
            const?char*?getEmail()?const
            ????{
            ????????
            return?mail;
            ????}

            ????size_t?getLength()?
            const
            ????{
            ????????
            return?strlen(mail);
            ????}
            };

            bool?operator?<?(const?Email&?lhs,?const?Email&?rhs)
            {
            ????
            if?(lhs.hash?==?rhs.hash)
            ????????
            return?strcmp(lhs.mail,?rhs.mail)?<?0;
            ????
            return?lhs.hash?<?rhs.hash;
            }



            #ifdef?USE_CACHE
            class?OfstreamBuffer
            {
            ????ofstream
            &?ofs;
            ????size_t?buf_size;
            ????size_t?offset;
            ????
            char*?buffer;

            public:
            ????OfstreamBuffer?(ofstream
            &?ofs_,?size_t?buf_size_)
            ????????:?ofs(ofs_),?buf_size(buf_size_),?offset(
            0)
            ????{
            ????????buffer?
            =?new?char[buf_size_];
            ????}

            ????
            ~OfstreamBuffer?()
            ????{
            ????????flush();
            ????????delete[]?buffer;
            ????}

            ????
            void?write?(const?char*?ptr,?size_t?size)
            ????{
            ????????
            while?(size?>?0)
            ????????{
            ????????????size_t?copy_size?
            =?buf_size?-?offset;
            ????????????
            if?(copy_size?>?size)
            ????????????????copy_size?
            =?size;
            ????????????memcpy?(buffer?
            +?offset,?ptr,?copy_size);
            ????????????offset?
            +=?copy_size;
            ????????????ptr?
            +=?copy_size;
            ????????????size?
            -=?copy_size;

            ????????????
            if?(offset?==?buf_size)
            ????????????????flush?();
            ????????}
            ????}

            ????
            void?flush?()
            ????{
            ????????
            if?(offset?>?0)
            ????????{
            ????????????ofs.write(buffer,?offset);
            ????????????offset?
            =?0;
            ????????}
            ????????ofs.flush();
            ????}
            };
            #else//?USE_CACHE
            class?OfstreamBuffer
            {
            ????ofstream
            &?ofs;

            public:
            ????OfstreamBuffer?(ofstream
            &?ofs_,?size_t?buf_size_)
            ????????:?ofs(ofs_)
            ????{
            ????}

            ????
            void?write?(const?char*?ptr,?size_t?size)
            ????{
            ????????ofs.write(ptr,?size);
            ????}

            ????
            void?flush?()
            ????{
            ????????ofs.flush();
            ????}
            };
            #endif?//?USE_CACHE




            int?main(int?argc,?char**?argv)
            {
            ????
            if?(argc?<?3)
            ????{
            ????????cout?
            <<?"Wrong?arguments"?<<?endl;
            ????????
            return?1;
            ????}

            ????
            for?(int?i=0;?i<1;?++i)?{

            ????????ifstream?ifs(argv[
            1],?ios::binary);
            ????????
            if?(!ifs)
            ????????{
            ????????????cout?
            <<?"Invalid?input?file"?<<?endl;
            ????????????
            return?2;
            ????????}

            ????????ofstream?ofs(argv[
            2],?ios::binary);
            ????????
            if?(!ofs)
            ????????{
            ????????????cout?
            <<?"Invalid?output?file"?<<?endl;
            ????????????
            return?3;
            ????????}

            ????????Timer?total,?part;
            ????????total.start();
            ????????part.start();

            ????????ifs.seekg(
            0,?ios_base::end);
            ????????size_t?file_size?
            =?(size_t)ifs.tellg()?+?1;
            ????????cout?
            <<?"file?size:?"?<<?file_size?<<?endl;
            ????????ifs.seekg(
            0,?ios_base::beg);

            ????????
            char*?buffer?=?new?char[file_size];
            ????????
            if?(!buffer){
            ????????????cout?
            <<?"Alloc?buffer?failed"?<<?endl;
            ????????????
            return?4;
            ????????}

            ????????PROFILE_OUTPUT(part,?
            "Alloc?buffer");

            ????????ifs.read(buffer,?file_size);
            ????????buffer[file_size]?
            =?'\0';

            ????????PROFILE_OUTPUT(part,?
            "Read?file");

            ????????vector
            <Email>?emails;
            ????????emails.reserve(
            1000000);

            ????????
            char*?begin?=?buffer;
            ????????
            char*?current?=?buffer;

            ????????
            while?(*current?!=?'\0')
            ????????{
            ????????????
            if?(*current?==?'\n')
            ????????????{
            ????????????????
            *current?=?'\0';
            ????????????????emails.push_back(begin);
            ????????????????begin?
            =?current?+?1;
            ????????????}
            ????????????
            ++?current;
            ????????}

            ????????PROFILE_OUTPUT(part,?
            "Put?emails?into?vector");

            ????????sort(emails.begin(),?emails.end());

            ????????PROFILE_OUTPUT(part,?
            "Sort?emails");

            ????????emails.erase?(unique(?emails.begin(),?emails.end()?),?emails.end());

            ????????PROFILE_OUTPUT(part,?
            "Unique?emails");

            ????????OfstreamBuffer?ofsBuffer?(ofs,?
            4?*?1024);

            ????????
            for?(vector<Email>::const_iterator?iter?=?emails.begin();
            ?????????????iter?
            !=?emails.end();
            ?????????????iter?
            ++?)
            ????????{
            ????????????ofsBuffer.write(iter
            ->getEmail(),?iter->getLength());
            ????????????ofsBuffer.write(
            "\n",?1);
            ????????}
            ????????ofsBuffer.flush();

            ????????PROFILE_OUTPUT(part,?
            "Write?emails?into?new?file");

            ????????total.stop();
            ????????cout?
            <<?"Total?used:?"?<<?total.milliseconds()?<<?"?ms."?<<?endl;


            ????????delete[]?buffer;
            ????}

            ????
            return?0;
            }
            這個版本在windows上用dev-cpp所帶的gcc 3.4.2來編譯,最好成績是609ms。在cygwin里可以達到650ms。
            posted on 2006-04-03 11:00 qiezi 閱讀(2782) 評論(27)  編輯 收藏 引用 所屬分類: C++雜談
            久久久久无码精品国产不卡| 91麻豆精品国产91久久久久久 | 97久久国产亚洲精品超碰热| 亚洲国产成人精品女人久久久 | 亚洲熟妇无码另类久久久| 久久精品成人免费观看97| 亚洲国产精品热久久| 国产日产久久高清欧美一区| 精品久久8x国产免费观看| 久久人人爽人人爽人人片AV不| 久久精品国产亚洲AV久| 久久这里有精品| 国内精品久久久久影院薰衣草| 99久久无色码中文字幕人妻| 亚洲色大成网站WWW久久九九| 色妞色综合久久夜夜| 99久久人妻无码精品系列| 狠狠色丁香婷婷久久综合不卡| 久久国产精品久久久| 久久精品国产精品亜洲毛片 | 久久香蕉国产线看观看99| 精品久久久久久中文字幕| 国产精品成人99久久久久91gav| 久久www免费人成看国产片| 婷婷久久综合九色综合绿巨人| 国内高清久久久久久| 88久久精品无码一区二区毛片| 久久精品无码一区二区三区日韩 | 狠狠色丁香久久综合五月| 久久久久婷婷| 欧美黑人又粗又大久久久| 久久国产高清一区二区三区| 久久大香萑太香蕉av| 77777亚洲午夜久久多喷| 久久久久人妻一区精品果冻| 亚洲精品无码成人片久久| 色成年激情久久综合| 亚洲欧美伊人久久综合一区二区| 久久91综合国产91久久精品| 久久中文字幕人妻丝袜| 久久中文字幕一区二区|