是的,這兩天我在玩Python!
為了給LAC添加內置的詞典, 需要將通過Lingoes-Extractor解出的數據導入到LAC使用的SQLite中. 這個過程並不複雜 -- 解開ld2文件數據,輸出每條記錄到一個文本文件中,然後處理每一條記錄導入到SQLite中即可,要是用C++來實現,也就兩個晚上的事情,但,但這次我又蛋疼了...
大約一個月前,工作中碰到一段Python腳本,對于我這樣習慣C++的人來說,看類似Python腳本跟天書差不多了...雖然上半年看了幾天Perl,但到今天也只記得名字了...
于是這次'痛定思痛',決定搞起一個腳本語言來.于是就有了這次蛋疼的事情 -- 用Python實現LD2到Sqlite的導入.
不多說,有興趣的直接看Python腳本吧...

htmlparser.py
#!/usr/bin/python
# coding:utf-8
import string
from HTMLParser import HTMLParser
class MyParser(HTMLParser):
    """Fill a DictData-like object from the markup of one LD2 record.

    A record body looks like::

        <C><E>forms</E><F><H><M>phonetic</M></H>
        <I><N><U>n.</U> meaning</N></I></F></C>

    HTMLParser reports tag names in lower case, so the handlers compare
    against lower-case names.

    State:
        result     -- the object being filled (set by parse()).
        flag       -- which element the next text chunk belongs to.
        levelField -- zero-based position of the current <f> (-1: none yet).
        levelInfo  -- zero-based position of the current <i> inside the
                      current <f> (-1: none yet).

    Storage contract shared with data2xml.addInfo: the j-th <i> of the k-th
    <f> is stored in ``result.field[k].info[k * 5 + j]`` and addInfo reads
    back the window ``info[k * 5:(k + 1) * 5]``.  Only the first five <i>
    entries of a field are therefore serialised.

    NOTE(review): each text chunk overwrites the previous one for the same
    element.  If HTMLParser delivers an element's text in several chunks
    (for instance around entity references), only the last chunk is kept --
    confirm whether the input can contain entities.
    """

    # Capture states kept in ``flag``.
    FLAG_NONE = -1
    FLAG_CONTENT = 0
    FLAG_EXTEND = 1
    FLAG_FIELD = 2
    FLAG_LINK = 3
    FLAG_SYMBOL = 4
    FLAG_INFO = 5
    FLAG_MEANING = 6
    FLAG_CATEGORY = 7

    # Width of one field's window inside DictField.info; must match the
    # slice used by data2xml.addInfo.
    INFO_PER_FIELD = 5

    _FLAG_BY_TAG = {
        'c': FLAG_CONTENT,
        'e': FLAG_EXTEND,
        'f': FLAG_FIELD,
        'l': FLAG_LINK,
        'm': FLAG_SYMBOL,
        'i': FLAG_INFO,
        'n': FLAG_MEANING,
        'u': FLAG_CATEGORY,
    }

    # Closing one of these ends text capture, so stray text that follows
    # (e.g. a trailing '\r' after the record) cannot clobber stored data.
    _CAPTURE_END_TAGS = ('e', 'l', 'm', 'n')

    result = None
    levelField = -1
    levelInfo = -1
    flag = FLAG_NONE

    def _info_slot(self):
        """Return the index in ``info`` of the current <i> of the current <f>."""
        return self.levelField * self.INFO_PER_FIELD + self.levelInfo

    def handle_starttag(self, tag, attrs):
        """Update the capture state; open a new field/info on <f>/<i>."""
        flag = self._FLAG_BY_TAG.get(tag)
        if flag is None:
            # Unknown tags (e.g. <h>) leave the capture state untouched.
            return
        if tag == 'f':
            self.result.field.append(DictField())
            self.levelField += 1
            # Info numbering restarts in every field so that the slots
            # line up with the window data2xml.addInfo reads for it.
            self.levelInfo = -1
        elif tag == 'i':
            if self.levelField < 0:
                # <i> outside any <f>: there is no field to attach it to.
                return
            self.levelInfo += 1
            infos = self.result.field[self.levelField].info
            slot = self._info_slot()
            # Grow the slot list on demand instead of assuming it is
            # already long enough (records with many fields).
            while len(infos) <= slot:
                infos.append(DictInfo())
        self.flag = flag

    def handle_endtag(self, tag):
        """Return to 'meaning' after </u>; stop capturing after e/l/m/n."""
        if tag == 'u':
            # <u> is nested in <n>: the text after it is the meaning.
            self.flag = self.FLAG_MEANING
        elif tag in self._CAPTURE_END_TAGS:
            self.flag = self.FLAG_NONE

    def handle_data(self, data):
        """Store a text chunk in the attribute selected by ``flag``."""
        flag = self.flag
        if flag == self.FLAG_EXTEND:
            self.result.extend.append(data)
            return
        if flag not in (self.FLAG_LINK, self.FLAG_SYMBOL,
                        self.FLAG_MEANING, self.FLAG_CATEGORY):
            return
        if self.levelField < 0:
            # Field-level text before any <f>: nothing to attach it to.
            return
        field = self.result.field[self.levelField]
        if flag == self.FLAG_LINK:
            field.link = data
        elif flag == self.FLAG_SYMBOL:
            field.symbol = data
        elif self.levelInfo >= 0:
            info = field.info[self._info_slot()]
            if flag == self.FLAG_MEANING:
                info.meaning = data
            else:
                info.category = data

    def parse(self, html, data):
        """Parse *html* and store what is found in *data*.

        The position counters and the capture state are reset first, so
        one parser instance can be used for several records.
        """
        self.levelField = -1
        self.levelInfo = -1
        self.flag = self.FLAG_NONE
        self.result = data
        self.feed(html)
class DictInfo:
    """One <I> entry of a field: a category (<U>) and its meaning (<N>)."""

    category = ''
    meaning = ''

    def __str__(self):
        return '[category = %s meaning = %s]' % (self.category, self.meaning)


class DictField:
    """One <F> entry of a record: symbol, link and a list of DictInfo slots.

    ``info`` is created per instance.  It used to be a class attribute,
    which made every DictField share (and keep growing) one single list,
    so meanings leaked from one word into the next.

    The list is still pre-filled with INFO_SLOTS empty entries because
    MyParser writes to fixed slot positions (field index * 5 + info index)
    and data2xml.addInfo stops at the first empty slot.
    """

    INFO_SLOTS = 25

    symbol = ''
    link = ''

    def __init__(self):
        self.info = [DictInfo() for _ in range(self.INFO_SLOTS)]

    def __str__(self):
        return '[symbol = %s | link = %s info = %s]' % (
            self.symbol, self.link, ' '.join(map(str, self.info)))
class DictData:
    """One dictionary record: the head word, its extra forms and its fields.

    ``extend`` (list of strings) and ``field`` (list of DictField) are
    created per instance.  As class attributes they were shared by every
    record, so each new word also carried the forms and fields of all
    words parsed before it.
    """

    word = ''

    def __init__(self):
        self.extend = []  # word forms, one string per <E> text chunk
        self.field = []   # DictField objects, one per <F>

    def __str__(self):
        return 'word = %s extend = %s field = %s' % (
            self.word,
            ' '.join(map(str, self.extend)),
            ' '.join(map(str, self.field)))
def parseHtml(html, output):
    """Parse the markup of one record into *output* with a fresh MyParser."""
    html_parser = MyParser()
    html_parser.parse(html, output)
    # close() makes HTMLParser process any input it is still buffering.
    html_parser.close()
def analyseLine(str, output):
    """Split one ``word = <markup>`` line and parse the markup into *output*.

    str    -- the line without its line terminator (the parameter name is
              kept for existing callers although it shadows the builtin).
    output -- DictData-like object; ``word`` is set here, everything else
              is filled in by parseHtml().

    The word is everything before the first ' ='; the markup starts one
    character after that separator (i.e. after ' = ').  A line without the
    separator is stored as a bare word and nothing is parsed; previously
    such a line lost its last character and its tail was parsed as markup.
    """
    line = str
    word, separator, rest = line.partition(' =')
    output.word = word
    if not separator:
        return
    # Skip the single character (a space) that follows ' ='.
    parseHtml(rest[1:], output)
這個實現了從解開后的ld2記錄到內部數據的解析;

data2xml.py
import string
#<X>
#<D>dictid</D>
#<E>E1</E>
#<E>E2</E>
#<F>
#<S>Symbol</S>
#<L>Link</L>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#</F>
#<F>
#<S>Symbol</S>
#<L>Link</L>
#<I>
#<C>category</C>
#<M>Meaning</M>
#</I>
#</F>
#</X>
def addtag(list, stag, etag):
    """Wrap every string of *list* in stag/etag and concatenate the results.

    list -- iterable of strings (the parameter name is kept for existing
            callers although it shadows the builtin).
    stag -- opening tag, e.g. '<e>'.
    etag -- closing tag, e.g. '</e>'.

    Surrounding spaces are stripped from each item.  Returns '' for an
    empty input.  The loop used to assign instead of append, so only the
    last item was ever emitted.
    """
    return ''.join([stag + item.strip(' ') + etag for item in list])
def addExtend(extend):
    """Serialise the word forms in *extend* as <e> elements via addtag()."""
    open_tag, close_tag = '<e>', '</e>'
    return addtag(extend, open_tag, close_tag)
def addInfo(info, index):
    """Serialise the info entries of field number *index* as <i> elements.

    info  -- list of objects with ``category`` and ``meaning`` strings.
    index -- zero-based position of the field; its entries live in the
             window info[index * 5:(index + 1) * 5].

    Entries are emitted in order until the first one whose category and
    meaning are both empty.  Empty parts are left out and surrounding
    spaces are stripped.  Returns '' when nothing is emitted.
    """
    per_field = 5  # window width; the parser stores entries the same way
    start = index * per_field
    parts = []
    for item in info[start:start + per_field]:
        if item.category == '' and item.meaning == '':
            # An untouched slot marks the end of this field's entries.
            break
        parts.append('<i>')
        if item.category != '':
            parts.append('<c>' + item.category.strip(' ') + '</c>')
        if item.meaning != '':
            parts.append('<m>' + item.meaning.strip(' ') + '</m>')
        parts.append('</i>')
    return ''.join(parts)
def addSubField(f, index):
    """Serialise the content of one field: <s>, <l> and its <i> entries.

    f     -- object with ``symbol``, ``link`` and ``info`` attributes.
    index -- zero-based position of the field, forwarded to addInfo().

    Empty symbol/link values are left out.  The link used to be closed
    with '/l>' instead of '</l>', which produced malformed output.
    """
    parts = []
    if f.symbol != '':
        parts.append('<s>' + f.symbol.strip(' ') + '</s>')
    if f.link != '':
        parts.append('<l>' + f.link.strip(' ') + '</l>')
    parts.append(addInfo(f.info, index))
    return ''.join(parts)
def addField(field):
    """Serialise every field of a record as its own <f>...</f> element.

    field -- list of field objects, in record order; the position of a
             field in this list is passed on to addSubField().

    This follows the layout documented in the format comment of this
    module (one <f> per field).  All fields used to be put into one
    single <f> wrapper.  Fields without any content are skipped, so no
    empty <f></f> elements are written.  Returns '' for an empty list.
    """
    parts = []
    for index, f in enumerate(field):
        body = addSubField(f, index)
        if body:
            parts.append('<f>' + body + '</f>')
    return ''.join(parts)
def data2xml(data, dictid=1):
    """Return the xml string stored for one record.

    data   -- object with ``extend`` and ``field`` lists.
    dictid -- value written into the <d> element; defaults to 1, the value
              that used to be hard-coded.

    Layout: <x><d>dictid</d> + <e> elements + <f> elements + </x>.
    """
    return ''.join([
        '<x>',
        '<d>%s</d>' % dictid,
        addExtend(data.extend),
        addField(data.field),
        '</x>',
    ])
這個實現了從內部數據到指定xml字串的處理;(蛋疼啊,ld2本來的數據也是xml格式的...但為了展現我'高超'的python功底,自己又定義了一次格式...)

dbaccess.py
#!/usr/bin/python
# coding:utf-8
import sqlite3 as sqlite
import re
def table_create(conn):
    """Create the Word, Src and Dict tables unless they already exist."""
    statements = (
        'CREATE TABLE IF NOT EXISTS Word (wordid INTEGER PRIMARY KEY, word TEXT, flag INTEGER)',
        'CREATE TABLE IF NOT EXISTS Src (srcid INTEGER PRIMARY KEY, wordid INTEGER, fmt INTEGER, orig INTEGER, content TEXT)',
        'CREATE TABLE IF NOT EXISTS Dict (dictid INTEGER PRIMARY KEY, title TEXT)',
    )
    cur = conn.cursor()
    for statement in statements:
        cur.execute(statement)
def add_dict(conn, title):
    """Insert a dictionary title into the Dict table and commit.

    The title is passed as a bound parameter.  It used to be pasted into
    the SQL text, so a title containing a single quote broke the statement.
    """
    if isinstance(title, bytes):
        # Bound text must be unicode on Python 2.
        # Assumes UTF-8 input, as declared in the file header -- TODO confirm.
        title = title.decode('utf-8')
    cursor = conn.cursor()
    cursor.execute('INSERT INTO Dict (title) VALUES (?)', (title,))
    conn.commit()
def add_record(conn, word, record):
    """Insert one word and its xml content; the caller commits.

    conn   -- open sqlite3 connection.
    word   -- head word, stored in Word with flag 1.
    record -- xml string, stored in Src (fmt 3, orig 1) and linked to the
              new Word row through its row id.

    Both values are passed as bound parameters.  They used to be pasted
    into the SQL text between double quotes, so a word containing '"'
    broke the statement and every '"' had to be removed from the content.
    The content is now stored unmodified.

    No commit is issued here so that a whole import runs in a single
    transaction.
    """
    # Bound text must be unicode on Python 2.
    # Assumes UTF-8 input, as declared in the file header -- TODO confirm.
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    if isinstance(record, bytes):
        record = record.decode('utf-8')
    cursor = conn.cursor()
    cursor.execute('INSERT INTO Word (word, flag) VALUES (?, 1)', (word,))
    cursor.execute(
        'INSERT INTO Src (wordid, fmt, orig, content) VALUES (?, 3, 1, ?)',
        (cursor.lastrowid, record))
def db_create(dbfile):
    """Open the SQLite database *dbfile* and return the connection."""
    connection = sqlite.connect(dbfile)
    return connection
def db_close(conn):
    """Commit pending changes on *conn*, then close it."""
    conn.commit()
    conn.close()
def db_test(conn):
    """Smoke test: insert the word 1234 (quotes removed) and commit.

    Strings are immutable, so the result of replace() has to be assigned;
    it used to be discarded, which left the quotes in place and produced
    an invalid statement.  The value is now passed as a bound parameter.
    """
    record = '"1234"'
    record = record.replace('"', '')
    cursor = conn.cursor()
    cursor.execute('INSERT INTO Word (word, flag) VALUES (?, 1)', (record,))
    conn.commit()
#######################################3
#conn = db_create('../data/lac.db')
#add_dict(conn, 'test')
#db_close(conn)
這個實現了相關的數據庫功能,包括主要的創建,寫入等;
#!/usr/bin/python
# coding:utf-8
import string
import htmlparser
import data2xml
import dbaccess
def main(input_path="../data/output.txt", db_path="../data/lac.db3",
         title='Vicon English-Chinese(S) Dictionary'):
    """Import every record of the extracted LD2 text file into SQLite.

    input_path -- text file with one ``word = <markup>`` record per line.
    db_path    -- SQLite database file to fill.
    title      -- dictionary title stored in the Dict table.

    The defaults are the values that used to be hard-coded.  Blank lines
    are skipped, and a trailing '\\r' is removed together with '\\n' so it
    cannot reach the parser as stray text.  The input file is always
    closed.  If anything fails, the connection is closed without
    committing the records added so far and the error is re-raised.
    """
    with open(input_path, "r") as source:
        conn = dbaccess.db_create(db_path)
        try:
            dbaccess.table_create(conn)
            dbaccess.add_dict(conn, title)
            for line in source:
                line = line.rstrip('\r\n')
                if not line:
                    continue
                data = htmlparser.DictData()
                htmlparser.analyseLine(line, data)
                dbaccess.add_record(conn, data.word, data2xml.data2xml(data))
        except Exception:
            conn.close()
            raise
        # Commits the whole import in one go, then closes the connection.
        dbaccess.db_close(conn)


if __name__ == '__main__':
    main()
這個就是main入口了...
如何?俺寫的Python腳本如何?要是俺跟你說,一個月前,我連Python都會拼錯,現在卻可以寫出如此'長'的Python腳本來了...你是覺得我很猛,還是覺得Python很簡單呢...
雖然在編寫Python腳本的時候,碰到了各種郁悶錯誤,各種坑爹的用法,但我還是滿喜歡Python的,
總比使用由那位獲得兩屆IOCCC大獎的家伙創建的Perl的感覺爽了很多很多啊....
<---- 松口氣的分割線 ---->
昨晚終于搞定了這最終的Python腳本,但測試的結果不是很滿意. 生成LAC的sqlite數據需要將近3個小時不說,這超過80MB的數據文件更加讓人崩潰...不過總的來說,終于可以先暫時放下這個數據導入的問題,繼續編寫LAC了...
這個月工作上有些變故,也一直難以靜心敲字,到上周也算到過階段了...終于可以放松下了...