464872333 发表于 2013-1-26 15:41:36

lucene进行全文检索的一个简单例子

    最近在研究用 Lucene 检索文档的问题,参考了网上一些人的例子,但结果只能检索英文。有人说要使用中文分词器,我也用了,结果还是一样,不能检索中文。后来经过一些高手的指点,解决了中文检索的问题。我用的 Lucene 版本为 3.0.2,中文分词器是 IKAnalyzer 3.2。下面是我的一些代码,仅供参考。
第一步:
 建立文件索引:
   public class IndexProcesser {// 成员变量存储创建的索引文件存放的位置private static String INDEX_STORE_PATH = "G:\\学习\\Lucene相关\\IndexWriter";private static String DATA_DIR = "G:\\学习\\Lucene相关\\IndexWriter\\searchFolder";/*** 索引dataDir下.txt文件,并储存在indexDir下,返回索引的文件数量* * @param indexDir* @param dataDir* @return* @throws Exception*/public static int createIndex(File indexDir, File dataDir) throws Exception {if (!dataDir.exists() || !dataDir.isDirectory()) {throw new IOException(dataDir+ " does not exist or is not a directory");}Analyzer analyzer = new IKAnalyzer();//IK分词器,网上还有别的分词器。IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir),analyzer, true, IndexWriter.MaxFieldLength.LIMITED);writer.setMergeFactor(1000);//合并因子writer.setMaxBufferedDocs(1000); //最大缓存文档数writer.setMaxMergeDocs(Integer.MAX_VALUE); //最大合并文档数writer.setMaxFieldLength(99999999);//增加内存域长度限制indexDirectory(writer, dataDir);int numIndexed = writer.numDocs();writer.optimize();writer.close();return numIndexed;}private static void indexDirectory(IndexWriter writer, File dataDir) {File[] files = dataDir.listFiles();for (int i = 0; i < files.length; i++) {File f = files;if (f.isDirectory()) {indexDirectory(writer, f);} else {try {indexFile(writer, f);} catch (IOException e) {e.printStackTrace();}}}}private static void indexFile(IndexWriter writer, File f)throws IOException {if (f.isHidden() || !f.canRead() || !f.exists()) {return;}System.out.println("indexIng>>" + f.getCanonicalPath());Document doc = new Document();doc.add(new Field("filePath", f.getAbsolutePath(), Field.Store.YES,Field.Index.ANALYZED));doc.add(new Field("content", readFile(f), Field.Store.YES,Field.Index.ANALYZED));writer.addDocument(doc);}//readFile()方法主要对你所要检索的文档进行解析,会用到一些相应的组件,如pdf有pdfBox组件,pdfBox对中文支持不好。我用的是xpdf,关于xpdf的配置,会在后面进行说明。word用到的是POI组件,还有别的格式,此处不在详说。private static String readFile(File f) {StringBuffer content = null;FileInputStream is = null;if (f.getName().endsWith(".doc")) {content=new StringBuffer();try{is=new 
FileInputStream(f);WordExtractor wordExtractor=new WordExtractor(is);content=content.append(wordExtractor.getText());is.close();}catch(Exception e){e.printStackTrace();}} else if (f.getName().endsWith(".pdf")) {String PATH_TO_XPDF="C:\\xpdftest\\xpdf\\pdftotext.exe";String[] cmd=new String[]{ PATH_TO_XPDF, "-enc", "UTF-8", "-q",f.getAbsoluteFile().toString(), "-" };try {Process p=Runtime.getRuntime().exec(cmd);BufferedInputStream bis=new BufferedInputStream(p.getInputStream());InputStreamReader reader=new InputStreamReader(bis,"UTF-8");int len=0;content=new StringBuffer();while((len=reader.read())!=-1){content.append((char)len);}reader.close();} catch (IOException e) {e.printStackTrace();}}else{try {content = new StringBuffer();is = new FileInputStream(f);BufferedReader br = new BufferedReader(new InputStreamReader(is, "GBK"));for (String line = null; (line = br.readLine()) != null;) {content.append(line).append("\n");}is.close();} catch (Exception e) {e.printStackTrace();}}return content.toString();}public static void main(String[] args) {long start = new Date().getTime();int numIndexed = 0;try {numIndexed = createIndex(new File(INDEX_STORE_PATH), new File(DATA_DIR));} catch (Exception e) {e.printStackTrace();}long end = new Date().getTime();System.out.println("Indexing " + numIndexed + " files took "+ (end - start) + "milliseconds");}} 
//xpdf配置说明:
  1.从http://www.foolabs.com/xpdf/download.html上下载xpdf3.02(xpdf-3.02pl2-win32.zip)和xpdf-chinese-simplified.tar.gz。
  2.将xpdf-3.02pl2-win32.zip解压放入c:/xpdf,同时将xpdf-chinese-simplified.tar.gz解压放入到该文件夹内。(注意:上面代码中 PATH_TO_XPDF 写的是 C:\xpdftest\xpdf\pdftotext.exe,请改成你实际解压后 pdftotext.exe 所在的路径。)
  3.打开解压后的xpdf-chinese-simplified文件夹下的add-to-xpdfrc文件,将其内容拷贝到
xpdfrc.txt中,
 
 
修改如下代码:
#----- begin Chinese Simplified support package (2004-jul-27)
cidToUnicode Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode
unicodeMap ISO-2022-CN C:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap
unicodeMap EUC-CN C:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap
unicodeMap GBK C:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap
cMapDir Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/CMap
toUnicodeDir C:/xpdf/xpdf-chinese-simplified/CMap
fontDir c:/windows/fonts
displayCIDFontTT Adobe-GB1 c:/windows/fonts/simhei(truetype)
textEOL CR+LF
#----- end Chinese Simplified support package
注意“C:/xpdf”部分路径,要和你本机的路径一致。
下面就是检索了:public class Search {private static String IndexDir="G:\\学习\\Lucene相关\\IndexWriter";private static String keyWord="努力";private static int TOP_NUM = 100;public static void doSearch(File indexDir,String key) throws Exception{IndexSearcher searcher=new IndexSearcher(FSDirectory.open(indexDir),true);String field="content";Query query=IKQueryParser.parse(field, keyWord);//=========================================================================long start=new Date().getTime();TopDocs hits=searcher.search(query, TOP_NUM);long end = new Date().getTime();//end timeSystem.out.println("共找到文档数:"+hits.totalHits); System.out.println("搜索完毕用时:" + (end - start)    + "毫秒");   if(hits.totalHits==0){ System.out.println("没有找到您需要的结果!"); }else{ for(int i=0;i<hits.scoreDocs.length;i++){ try{ ScoreDoc scoreDoc = hits.scoreDocs;// 有变化的地方                      Document doc = searcher.doc(scoreDoc.doc);// 有变化的地方                      System.out.print("这是第" + (i+1) + "个检索结果,文件路径为:");                      System.out.println(doc.get("filePath"));   }catch(Exception e){} } }searcher.close();   }public static void main(String[] args) throws Exception {File indexDir=new File(IndexDir);if(!indexDir.isDirectory()||!indexDir.exists()){throw new Exception(indexDir +      " does not exist or is not a directory。");}doSearch(indexDir, keyWord);}}  
 
页: [1]
查看完整版本: lucene进行全文检索的一个简单例子