DFA與捕獲和預查結合起來的話很麻煩,不能用一張表來迭代,而是要用回溯,然后在回溯的時候記下狀態(tài)。至此正則表達式的所有算法都完成了,接下來是詳細介紹。
每一個回溯的記錄都需要一張帶有所有捕獲軌跡的表(元素為堆棧的堆棧),給每一個記錄集一張表顯然很浪費,因此我們只用一張表來記錄。但是我們對表進行刪除項目的時候,又可能會影響到上一個回溯的數(shù)據(jù),因此我們要開發(fā)出一個新的堆棧,可以push,可以pop,而且pop之后再push不會破壞原有的東西,一直到外邊顯示指定跳到前一個狀態(tài)為止:
1 template<typename T, typename K>
2 void Push(List<T, K>& elements, int& available, int& count, const T& element)
3 {
4 if(elements.Count()==count)
5 {
6 elements.Add(element);
7 }
8 else
9 {
10 elements[count]=element;
11 }
12 T& current=elements[count];
13 current.previous=available;
14 available=count++;
15 }
16
17 template<typename T, typename K>
18 T Pop(List<T, K>& elements, int& available, int& count)
19 {
20 T& current=elements[available];
21 available=current.previous;
22 return current;
23 }
于是我們就可以對可以回溯的列表進行保存了。一開始available=-1,count=0。每次push的時候改變available和count,但是對于列表來說,0到count-1的東西都是安全的。pop的時候改變available,但是count保留不變,這個時候elements[count-1]可以通過elements[count-1].available(而不是參數(shù)的available)追溯到上一個項。這樣就把所有存在的堆棧都壓入到同一個列表里面去了,而且不同的堆棧可以通過不同的count訪問,一直到比較小的count決定push東西了,大的count對應的堆棧才會受到破壞。于是我們就用一個列表和一個count表組成了一個元素是堆棧的堆棧。
知道怎么操作之后下面的匹配代碼就很好理解了:
1 #include "RegexRich.h"
2
3 namespace vl
4 {
5 namespace regex_internal
6 {
7
8 /***********************************************************************
9 回溯輔助數(shù)據(jù)結構
10 ***********************************************************************/
11
12 class SaverBase
13 {
14 public:
15 bool available;
16 int previous;
17 };
18
19 class StateSaver
20 {
21 public:
22 enum StateStoreType
23 {
24 Positive,
25 Negative,
26 Other
27 };
28
29 const wchar_t* reading; //當前字符串位置
30 State* currentState; //當前狀態(tài)
31 int minTransition; //最小可用轉換
32 int captureCount; //有效capture數(shù)量
33 int stateSaverCount; //有效回溯狀態(tài)數(shù)量
34 int extensionSaverAvailable; //有效未封閉擴展功能數(shù)量
35 int extensionSaverCount; //所有未封閉擴展功能數(shù)量
36 StateStoreType storeType; //保存狀態(tài)的原因
37
38 bool operator==(const StateSaver& saver)
39 {
40 return
41 reading==saver.reading &&
42 currentState==saver.currentState &&
43 minTransition==saver.minTransition &&
44 captureCount==saver.captureCount;
45 }
46 };
47
48 class ExtensionSaver : public SaverBase
49 {
50 public:
51 int captureListIndex;
52 Transition* transition;
53 const wchar_t* reading;
54
55 bool operator==(const ExtensionSaver& saver)
56 {
57 return
58 captureListIndex==saver.captureListIndex &&
59 transition==saver.transition &&
60 reading==saver.reading;
61 }
62 };
63
64 template<typename T, typename K>
65 void Push(List<T, K>& elements, int& available, int& count, const T& element)
66 {
67 if(elements.Count()==count)
68 {
69 elements.Add(element);
70 }
71 else
72 {
73 elements[count]=element;
74 }
75 T& current=elements[count];
76 current.previous=available;
77 available=count++;
78 }
79
80 template<typename T, typename K>
81 T Pop(List<T, K>& elements, int& available, int& count)
82 {
83 T& current=elements[available];
84 available=current.previous;
85 return current;
86 }
87
88 template<typename T, typename K>
89 void PushNonSaver(List<T, K>& elements, int& count, const T& element)
90 {
91 if(elements.Count()==count)
92 {
93 elements.Add(element);
94 }
95 else
96 {
97 elements[count]=element;
98 }
99 count++;
100 }
101
102 template<typename T, typename K>
103 T PopNonSaver(List<T, K>& elements, int& count)
104 {
105 return elements[--count];
106 }
107 }
108
109 template<>
110 struct POD<regex_internal::StateSaver>
111 {
112 static const bool Result=true;
113 };
114
115 template<>
116 struct POD<regex_internal::ExtensionSaver>
117 {
118 static const bool Result=true;
119 };
120
121 namespace regex_internal
122 {
123 /***********************************************************************
124 CaptureRecord
125 ***********************************************************************/
126
127 bool CaptureRecord::operator==(const CaptureRecord& record)
128 {
129 return capture==record.capture && start==record.start && length==record.length;
130 }
131
132 /***********************************************************************
133 PureInterpretor
134 ***********************************************************************/
135
136 RichInterpretor::RichInterpretor(Automaton::Ref _dfa)
137 :dfa(_dfa)
138 {
139 datas=new UserData[dfa->states.Count()];
140
141 for(int i=0;i<dfa->states.Count();i++)
142 {
143 State* state=dfa->states[i].Obj();
144 int charEdges=0;
145 int nonCharEdges=0;
146 bool mustSave=false;
147 for(int j=0;j<state->transitions.Count();j++)
148 {
149 if(state->transitions[j]->type==Transition::Chars)
150 {
151 charEdges++;
152 }
153 else
154 {
155 if(state->transitions[j]->type==Transition::Negative ||
156 state->transitions[j]->type==Transition::Positive)
157 {
158 mustSave=true;
159 }
160 nonCharEdges++;
161 }
162 }
163 datas[i].NeedKeepState=mustSave || nonCharEdges>1 || (nonCharEdges!=0 && charEdges!=0);
164 state->userData=&datas[i];
165 }
166 }
167
168 RichInterpretor::~RichInterpretor()
169 {
170 delete[] datas;
171 }
172
173 bool RichInterpretor::MatchHead(const wchar_t* input, const wchar_t* start, Result& result)
174 {
175 List<StateSaver> stateSavers;
176 List<ExtensionSaver> extensionSavers;
177
178 StateSaver currentState;
179 currentState.captureCount=0;
180 currentState.currentState=dfa->startState;
181 currentState.extensionSaverAvailable=-1;
182 currentState.extensionSaverCount=0;
183 currentState.minTransition=0;
184 currentState.reading=input;
185 currentState.stateSaverCount=0;
186 currentState.storeType=StateSaver::Other;
187
188 while(!currentState.currentState->finalState)
189 {
190 bool found=false;
191 StateSaver oldState=currentState;
192 //開始遍歷轉換
193 for(int i=currentState.minTransition;i<currentState.currentState->transitions.Count();i++)
194 {
195 Transition* transition=currentState.currentState->transitions[i];
196 switch(transition->type)
197 {
198 case Transition::Chars:
199 {
200 CharRange range=transition->range;
201 found=
202 range.begin<=*currentState.reading &&
203 range.end>=*currentState.reading;
204 if(found)
205 {
206 currentState.reading++;
207 }
208 }
209 break;
210 case Transition::BeginString:
211 {
212 found=currentState.reading==start;
213 }
214 break;
215 case Transition::EndString:
216 {
217 found=*currentState.reading==L'\0';
218 }
219 break;
220 case Transition::Nop:
221 {
222 found=true;
223 }
224 break;
225 case Transition::Capture:
226 {
227 ExtensionSaver saver;
228 saver.captureListIndex=currentState.captureCount;
229 saver.reading=currentState.reading;
230 saver.transition=transition;
231 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
232
233 CaptureRecord capture;
234 capture.capture=transition->capture;
235 capture.start=currentState.reading-input;
236 capture.length=-1;
237 PushNonSaver(result.captures, currentState.captureCount, capture);
238
239 found=true;
240 }
241 break;
242 case Transition::Match:
243 {
244 int index=0;
245 for(int j=0;j<currentState.captureCount;j++)
246 {
247 CaptureRecord& capture=result.captures[j];
248 if(capture.capture==transition->capture)
249 {
250 if(capture.length!=-1 && (transition->index==-1 || transition->index==index))
251 {
252 if(wcsncmp(input+capture.start, currentState.reading, capture.length)==0)
253 {
254 currentState.reading+=capture.length;
255 found=true;
256 break;
257 }
258 }
259 if(transition->index!=-1 && index==transition->index)
260 {
261 break;
262 }
263 else
264 {
265 index++;
266 }
267 }
268 }
269 }
270 break;
271 case Transition::Positive:
272 {
273 ExtensionSaver saver;
274 saver.captureListIndex=-1;
275 saver.reading=currentState.reading;
276 saver.transition=transition;
277 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
278 //Positive的oldState一定會被push
279 oldState.storeType=StateSaver::Positive;
280 found=true;
281 }
282 break;
283 case Transition::Negative:
284 {
285 ExtensionSaver saver;
286 saver.captureListIndex=-1;
287 saver.reading=currentState.reading;
288 saver.transition=transition;
289 Push(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount, saver);
290 //Negative的oldState一定會被push
291 oldState.storeType=StateSaver::Negative;
292 found=true;
293 }
294 break;
295 case Transition::NegativeFail:
296 {
297 //只有在回溯的時候NegativeFail才會被考慮
298 }
299 break;
300 case Transition::End:
301 {
302 ExtensionSaver saver=Pop(extensionSavers, currentState.extensionSaverAvailable, currentState.extensionSaverCount);
303 switch(saver.transition->type)
304 {
305 case Transition::Capture:
306 {
307 CaptureRecord& capture=result.captures[saver.captureListIndex];
308 capture.length=(currentState.reading-input)-capture.start;
309 found=true;
310 }
311 break;
312 case Transition::Positive:
313 for(int j=currentState.stateSaverCount-1;j>=0;j--)
314 {
315 StateSaver& saver=stateSavers[j];
316 if(saver.storeType==StateSaver::Positive)
317 {
318 oldState.reading=saver.reading;
319 oldState.stateSaverCount=j;
320 currentState.reading=saver.reading;
321 currentState.stateSaverCount=j;
322 break;
323 }
324 }
325 found=true;
326 break;
327 case Transition::Negative:
328 for(int j=currentState.stateSaverCount-1;j>=0;j--)
329 {
330 StateSaver& saver=stateSavers[j];
331 if(saver.storeType==StateSaver::Negative)
332 {
333 oldState=saver;
334 oldState.storeType=StateSaver::Other;
335 currentState=saver;
336 currentState.storeType=StateSaver::Other;
337 i=currentState.minTransition-1;
338 break;
339 }
340 }
341 break;
342 }
343 }
344 break;
345 }
346 //尋找成功,在必要的時候保存當前的回溯狀態(tài)
347 if(found)
348 {
349 UserData* data=(UserData*)currentState.currentState->userData;
350 if(data->NeedKeepState)
351 {
352 oldState.minTransition=i+1;
353 PushNonSaver(stateSavers, currentState.stateSaverCount, oldState);
354 }
355 currentState.currentState=transition->target;
356 currentState.minTransition=0;
357 break;
358 }
359 }
360 if(!found)
361 {
362 //存在回溯記錄則回溯
363 if(currentState.stateSaverCount)
364 {
365 //恢復Negative失敗狀態(tài)的時候要移動到NegativeFail后面
366 currentState=PopNonSaver(stateSavers, currentState.stateSaverCount);
367 //minTransition總是被+1后保存,因此直接-1總是有效值
368 if(currentState.currentState->transitions[currentState.minTransition-1]->type==Transition::Negative)
369 {
370 //尋找NegativeFail
371 for(int i=0;i<currentState.currentState->transitions.Count();i++)
372 {
373 Transition* transition=currentState.currentState->transitions[i];
374 if(transition->type==Transition::NegativeFail)
375 {
376 //將當前狀態(tài)移動到NegativeFail后面
377 currentState.currentState=transition->target;
378 currentState.minTransition=0;
379 currentState.storeType=StateSaver::Other;
380 break;
381 }
382 }
383 }
384 }
385 else
386 {
387 break;
388 }
389 }
390 }
391
392 //判斷是否成功并且處理返回結果
393 if(currentState.currentState->finalState)
394 {
395 result.start=0;
396 result.length=currentState.reading-input;
397 for(int i=result.captures.Count()-1;i>=currentState.captureCount;i++)
398 {
399 result.captures.RemoveAt(i);
400 }
401 return true;
402 }
403 else
404 {
405 result.captures.Clear();
406 return false;
407 }
408 }
409
410 bool RichInterpretor::Match(const wchar_t* input, const wchar_t* start, Result& result)
411 {
412 const wchar_t* read=input;
413 while(*read)
414 {
415 if(MatchHead(read, start, result))
416 {
417 result.start=read-input;
418 for(int i=0;i<result.captures.Count();i++)
419 {
420 result.captures[i].start+=result.start;
421 }
422 return true;
423 }
424 read++;
425 }
426 return false;
427 }
428
429 const IReadonlyList<WString>& RichInterpretor::CaptureNames()
430 {
431 return dfa->captureNames.Wrap();
432 }
433 }
434 }
于是帶有各種擴展功能的匹配就完成了。之前還用同樣的前端開發(fā)了一個不帶擴展功能的純匹配的正則表達式引擎。因為過于簡單就不列出來了。這些算法的詳細介紹可以看
這里。
下面的工作就是寫更多的單元測試用例,最后將這兩個不同的匹配算法封裝在一起,智能地在正確的情況下選擇更快的算法。這樣正則表達式引擎的效率將會大大提高。
posted on 2009-11-14 19:13
陳梓瀚(vczh) 閱讀(2469)
評論(1) 編輯 收藏 引用 所屬分類:
VL++3.0開發(fā)紀事