download the code
昨天實現了基于int類型的strstr函數,可以獲得1~2X左右的加速。今天按 lstrstr的流程實現了基于SSE2的STRSTR函數。可以得到2~4X左右的加速。
昨天實現了基于int類型的strstr函數,可以獲得1~2X左右的加速。今天按 lstrstr的流程實現了基于SSE2的STRSTR函數。可以得到2~4X左右的加速。
1 char* lstrstrsse(char* text, char* pattern)
2 {
3 __m128i * sseiPtr = (__m128i *) text;
4 unsigned char * chPtrAligned = (unsigned char*)text;
5 __m128i sseiWord0 ;//= *sseiPtr ;
6 __m128i sseiWord1 ;//= *sseiPtr ;
7 __m128i sseiZero = _mm_set1_epi8(0);
8 char chara = pattern[0];
9 char charb = pattern[1];
10 register __m128i byte16a;
11 register __m128i byte16b;
12 char* bytePtr =text;
13 if(pattern ==NULL) return NULL;
14 if(pattern[0] == 0) return NULL;
15 if(pattern[1] == 0) return lstrchr(text,pattern[0]);
16 byte16a = _mm_set1_epi8(chara);
17 byte16b = _mm_set1_epi8(charb);
18 // process the unaligned bytes
19
20 // the aligned bytes
21 alignStart:
22 sseiWord0 = *sseiPtr;
23 sseiWord1 = *(sseiPtr+1);
24 while( haszeroByte(sseiWord0,sseiWord1,sseiZero) ==0)
25 {
26 unsigned int reta ;
27 searcha:
28 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
29 if(reta!=0 ) {
30 unsigned int retb ;
31 findouta:
32 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
33 findoutb:
34 if(((reta<<1) & retb)){
35 // have ab
36 int i=1;
37 char * bytePtr0 = (char*) ( sseiPtr );
38 int j;
39 //printf("test::%0x,%d\n",reta ,bytePtr0 -text);
40 bytePtr = (char*) ( sseiPtr );
41 for(j =0;j<8;j++){
42 if(reta & 0xff) {
43 if(bytePtr0[0] == chara){
44 i =1;
45 bytePtr = bytePtr0 ;
46 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
47 if(pattern[i] == 0) return bytePtr;
48 }
49 if(bytePtr0[1] == chara){
50 i =1;
51 bytePtr = bytePtr0 + 1;
52 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
53 if(pattern[i] == 0) return bytePtr;
54 }
55 if(bytePtr0[2] == chara){
56 i =1;
57 bytePtr = bytePtr0 + 2;
58 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
59 if(pattern[i] == 0) return bytePtr;
60 }
61 if(bytePtr0[3] == chara){
62 i =1;
63 bytePtr = bytePtr0 + 3;
64 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
65 if(pattern[i] == 0) return bytePtr;
66 }
67 }
68 reta = reta >> 4;
69 bytePtr0 += 4;
70 }
71 }
72 // search b
73 sseiPtr += 2;
74 sseiWord0 = *sseiPtr;
75 sseiWord1 = *(sseiPtr+1);
76
77 while( haszeroByte(sseiWord0,sseiWord1,sseiZero) ==0){
78 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
79 if(retb !=0){
80 // findout b
81 if((*((char*) sseiPtr)) == charb){
82 //b000
83 char * bytePtr = ((char*) ( sseiPtr )) -1;
84 if(bytePtr[0] == chara){
85 int i=1;
86 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
87 if(pattern[i] == 0) return bytePtr;
88 if(bytePtr[i] == 0) return NULL;
89 }
90
91 }
92 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
93 if(reta !=0)
94 goto findoutb;
95 else{
96 goto nextWord;
97 }
98 }
99 sseiPtr += 2;
100 sseiWord0 = *sseiPtr;
101 sseiWord1 = *(sseiPtr+1);
102 }
103 // search from (char*)sseiPtr
104 char * bytePtr = ((char*) ( sseiPtr )) -1;
105 if(bytePtr[0] == chara){
106 int i=1;
107 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
108 if(pattern[i] == 0) return bytePtr;
109 }
110
111 goto prePareForEnd;
112 }
113 nextWord:
114 sseiPtr += 2;
115 sseiWord0 = *sseiPtr;
116 sseiWord1 = *(sseiPtr+1);
117 }
118 prePareForEnd:
119 {
120 unsigned int reta;
121 unsigned int retb;
122 reta =hasByteC(sseiWord0,sseiWord1, byte16a);
123 retb =hasByteC(sseiWord0,sseiWord1, byte16b);
124 if(((reta<<1) & retb)){
125 bytePtr = (char*)sseiPtr;
126 while(*bytePtr){
127 if(*bytePtr == chara) {
128 int i=1;
129 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
130 if(pattern[i] == 0) return bytePtr;
131 if(bytePtr[i] == 0) return NULL;
132
133 }
134 bytePtr++;
135 }
136 }
137 }
138 return NULL;
139 }
140
2 {
3 __m128i * sseiPtr = (__m128i *) text;
4 unsigned char * chPtrAligned = (unsigned char*)text;
5 __m128i sseiWord0 ;//= *sseiPtr ;
6 __m128i sseiWord1 ;//= *sseiPtr ;
7 __m128i sseiZero = _mm_set1_epi8(0);
8 char chara = pattern[0];
9 char charb = pattern[1];
10 register __m128i byte16a;
11 register __m128i byte16b;
12 char* bytePtr =text;
13 if(pattern ==NULL) return NULL;
14 if(pattern[0] == 0) return NULL;
15 if(pattern[1] == 0) return lstrchr(text,pattern[0]);
16 byte16a = _mm_set1_epi8(chara);
17 byte16b = _mm_set1_epi8(charb);
18 // process the unaligned bytes
19

20 // the aligned bytes
21 alignStart:
22 sseiWord0 = *sseiPtr;
23 sseiWord1 = *(sseiPtr+1);
24 while( haszeroByte(sseiWord0,sseiWord1,sseiZero) ==0)
25 {
26 unsigned int reta ;
27 searcha:
28 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
29 if(reta!=0 ) {
30 unsigned int retb ;
31 findouta:
32 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
33 findoutb:
34 if(((reta<<1) & retb)){
35 // have ab
36 int i=1;
37 char * bytePtr0 = (char*) ( sseiPtr );
38 int j;
39 //printf("test::%0x,%d\n",reta ,bytePtr0 -text);
40 bytePtr = (char*) ( sseiPtr );
41 for(j =0;j<8;j++){
42 if(reta & 0xff) {
43 if(bytePtr0[0] == chara){
44 i =1;
45 bytePtr = bytePtr0 ;
46 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
47 if(pattern[i] == 0) return bytePtr;
48 }
49 if(bytePtr0[1] == chara){
50 i =1;
51 bytePtr = bytePtr0 + 1;
52 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
53 if(pattern[i] == 0) return bytePtr;
54 }
55 if(bytePtr0[2] == chara){
56 i =1;
57 bytePtr = bytePtr0 + 2;
58 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
59 if(pattern[i] == 0) return bytePtr;
60 }
61 if(bytePtr0[3] == chara){
62 i =1;
63 bytePtr = bytePtr0 + 3;
64 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
65 if(pattern[i] == 0) return bytePtr;
66 }
67 }
68 reta = reta >> 4;
69 bytePtr0 += 4;
70 }
71 }
72 // search b
73 sseiPtr += 2;
74 sseiWord0 = *sseiPtr;
75 sseiWord1 = *(sseiPtr+1);
76
77 while( haszeroByte(sseiWord0,sseiWord1,sseiZero) ==0){
78 retb = hasByteC(sseiWord0,sseiWord1, byte16b);
79 if(retb !=0){
80 // findout b
81 if((*((char*) sseiPtr)) == charb){
82 //b000
83 char * bytePtr = ((char*) ( sseiPtr )) -1;
84 if(bytePtr[0] == chara){
85 int i=1;
86 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
87 if(pattern[i] == 0) return bytePtr;
88 if(bytePtr[i] == 0) return NULL;
89 }
90
91 }
92 reta = hasByteC(sseiWord0,sseiWord1, byte16a);
93 if(reta !=0)
94 goto findoutb;
95 else{
96 goto nextWord;
97 }
98 }
99 sseiPtr += 2;
100 sseiWord0 = *sseiPtr;
101 sseiWord1 = *(sseiPtr+1);
102 }
103 // search from (char*)sseiPtr
104 char * bytePtr = ((char*) ( sseiPtr )) -1;
105 if(bytePtr[0] == chara){
106 int i=1;
107 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
108 if(pattern[i] == 0) return bytePtr;
109 }
110
111 goto prePareForEnd;
112 }
113 nextWord:
114 sseiPtr += 2;
115 sseiWord0 = *sseiPtr;
116 sseiWord1 = *(sseiPtr+1);
117 }
118 prePareForEnd:
119 {
120 unsigned int reta;
121 unsigned int retb;
122 reta =hasByteC(sseiWord0,sseiWord1, byte16a);
123 retb =hasByteC(sseiWord0,sseiWord1, byte16b);
124 if(((reta<<1) & retb)){
125 bytePtr = (char*)sseiPtr;
126 while(*bytePtr){
127 if(*bytePtr == chara) {
128 int i=1;
129 while((pattern[i] )&&(bytePtr[i] == pattern[i])) i++;
130 if(pattern[i] == 0) return bytePtr;
131 if(bytePtr[i] == 0) return NULL;
132
133 }
134 bytePtr++;
135 }
136 }
137 }
138 return NULL;
139 }
140