|
前一段时间做东西用到了 PyLucene,包括建立索引、检索、高亮显示等等。贴两段代码,希望对大家有用。
PyLucene 的安装就不多说了,我用的版本是 PyLucene-1.9.1。
建立索引:
#!/usr/bin/env python
import os
import PyLucene
class IndexFiles:
"""
create index by PyLucene, just need your dir path,
the result files saved in the directory index in
current path
"""
def __init__(self, root, storeDir, analyzer):
if not os.path.exists(storeDir):
os.mkdir(storeDir)
analyzer = PyLucene.StandardAnalyzer()
store = PyLucene.FSDirectory.getDirectory(storeDir , True)
writer = PyLucene.IndexWriter(store, analyzer, True)
self.indexDocs(root, writer)
print 'optimizing index',
writer.optimize()
writer.close()
print 'done'
def indexDocs(self, root, writer):
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'gbk')
file.close()
doc = PyLucene.Document()
doc.add(PyLucene.Field.Keyword(u"name", filename.decode('gbk')))
doc.add(PyLucene.Field.Text(u"path", path.decode('gbk')))
if len(contents) > 0:
pass
doc.add(PyLucene.Field.Text(u"contents", contents))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
def indexmain(path):
try:
IndexFiles(path, "index", PyLucene.StandardAnalyzer())
return ''
except Exception, e:
return e
if __name__ == '__main__':
indexpath = raw_input("path: ")
indexmain(indexpath)
检索,高亮显示:
#!/usr/bin/env python
import time
from StringIO import StringIO
from PyLucene import *
class TestFormatter(Formatter):
    """Highlighter formatter that wraps matching terms in a red <font> tag."""

    def __init__(self):
        pass

    def highlightTerm(self, originalText, group):
        """Return ``originalText`` marked up for display.

        originalText -- the piece of text being considered.
        group        -- object whose ``getTotalScore()`` tells whether the
                        text matched the query; a score <= 0 means no match.

        Non-matching text is returned unchanged.
        """
        if group.getTotalScore() <= 0:
            return originalText
        # Single-quoted literals so that the inner double quotes of the HTML
        # attribute do not terminate the string.
        return '<font color="red">' + originalText + '</font>'
class Search:
    """Query the index stored in the "index" directory and highlight hits."""

    def __init__(self):
        """Open the existing index directory and set the highlight options."""
        STORE_DIR = "index"
        # False: open the directory as it is instead of creating a new one.
        self.directory = FSDirectory.getDirectory(STORE_DIR, False)
        self.analyzer = ChineseAnalyzer()
        # At most this many fragments are joined per hit ...
        self.maxNumFragmentsRequired = 2
        # ... using this separator between them.
        self.fragmentSeparator = u"..."

    def search(self, query, start):
        """Run ``query`` against the "contents" field; return one page of hits.

        query -- query text, either a GBK-encoded byte string or a unicode
                 object.
        start -- index of the first hit to return; up to 10 hits are returned.

        Returns ``(resultdic, totalnum, usetime)``:
        resultdic -- dict mapping hit score to ``[highlighted_text, path]``.
        totalnum  -- total number of hits for the query.
        usetime   -- seconds spent searching and highlighting.
        """
        # Decode byte strings only. Calling .decode() on a unicode object makes
        # Python 2 encode it to ASCII first, which fails for Chinese text.
        if isinstance(query, str):
            query = query.decode('gbk')
        searcher = IndexSearcher(self.directory)
        try:
            parsed = QueryParser.parse(query, "contents", self.analyzer)
            starttime = time.time()
            hits = searcher.search(parsed)
            highlighter = Highlighter(TestFormatter(), QueryScorer(parsed))
            highlighter.setTextFragmenter(SimpleFragmenter(60))
            resultdic = {}
            totalnum = hits.length()
            for offset in range(10):
                index = start + offset
                if index >= totalnum:
                    break
                try:
                    doc = hits.doc(index)
                except Exception:
                    # Skip a hit whose document cannot be loaded.
                    continue
                text = doc.get("contents")
                tokenStream = self.analyzer.tokenStream("contents", StringIO(text))
                result = highlighter.getBestFragments(
                    tokenStream,
                    text,
                    self.maxNumFragmentsRequired,
                    self.fragmentSeparator)
                score = hits.score(index)
                # Results are keyed by score, so keep nudging the key until it
                # is free. A single nudge still lost hits when three or more
                # shared the same score.
                while score in resultdic:
                    score += 0.0001
                resultdic[score] = [result, doc.get("path")]
            usetime = time.time() - starttime
        finally:
            # Release the searcher even if parsing or highlighting fails.
            searcher.close()
        return resultdic, totalnum, usetime
if __name__ == '__main__':
tt = Search()
command = raw_input("Query:").decode('gbk')
tt.search(command, 0)
处理中文时注意编码:上面的示例假定文件内容、文件名和查询串都是 GBK 编码,先解码成 unicode 再交给 PyLucene;如果你的环境使用其他编码(例如 UTF-8),请相应修改。
原文摘自:http://hi.baidu.com/uniqcmt/blog/item/fde41631588da7ac5fdf0ed7.html
|
|