qiuzhuoxian 2011-03-09
最近在研究用 Lucene 检索文档的问题。参考了网上一些人的例子,但结果只能检索英文;有人说要使用中文分词器,我也用了,结果还是一样,不能检索中文。后来经过一些高手的指点,解决了中文检索的问题。我用的 Lucene 版本为 3.0.2,中文分词器是 IKAnalyzer 3.2。下面是我的一些代码,仅供参考。
第一步:
建立文件索引:public class IndexProcesser { // 成员变量存储创建的索引文件存放的位置 private static String INDEX_STORE_PATH = "G:\\学习\\Lucene相关\\IndexWriter"; private static String DATA_DIR = "G:\\学习\\Lucene相关\\IndexWriter\\searchFolder"; /** * 索引dataDir下.txt文件,并储存在indexDir下,返回索引的文件数量 * * @param indexDir * @param dataDir * @return * @throws Exception */ public static int createIndex(File indexDir, File dataDir) throws Exception { if (!dataDir.exists() || !dataDir.isDirectory()) { throw new IOException(dataDir + " does not exist or is not a directory"); } Analyzer analyzer = new IKAnalyzer();//IK分词器,网上还有别的分词器。 IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
writer.setMergeFactor(1000); //合并因子 writer.setMaxBufferedDocs(1000); //最大缓存文档数 writer.setMaxMergeDocs(Integer.MAX_VALUE); //最大合并文档数 writer.setMaxFieldLength(99999999);//增加内存域长度限制 indexDirectory(writer, dataDir); int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } private static void indexDirectory(IndexWriter writer, File dataDir) { File[] files = dataDir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); } else { try { indexFile(writer, f); } catch (IOException e) { e.printStackTrace(); } } } } private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.canRead() || !f.exists()) { return; } System.out.println("indexIng>>" + f.getCanonicalPath()); Document doc = new Document(); doc.add(new Field("filePath", f.getAbsolutePath(), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("content", readFile(f), Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); } //readFile()方法主要对你所要检索的文档进行解析,会用到一些相应的组件,如pdf有pdfBox组件,pdfBox对中文支持不好。我用的是xpdf,关于xpdf的配置,会在后面进行说明。word用到的是POI组件,还有别的格式,此处不在详说。 private static String readFile(File f) { StringBuffer content = null; FileInputStream is = null; if (f.getName().endsWith(".doc")) { content=new StringBuffer(); try{ is=new FileInputStream(f); WordExtractor wordExtractor=new WordExtractor(is); content=content.append(wordExtractor.getText()); is.close(); }catch(Exception e){ e.printStackTrace(); } } else if (f.getName().endsWith(".pdf")) { String PATH_TO_XPDF="C:\\xpdftest\\xpdf\\pdftotext.exe"; String[] cmd=new String[]{ PATH_TO_XPDF, "-enc", "UTF-8", "-q",f.getAbsoluteFile().toString(), "-" }; try { Process p=Runtime.getRuntime().exec(cmd); BufferedInputStream bis=new BufferedInputStream(p.getInputStream()); InputStreamReader reader=new InputStreamReader(bis,"UTF-8"); int len=0; content=new StringBuffer(); while((len=reader.read())!=-1){ content.append((char)len); } 
reader.close(); } catch (IOException e) { e.printStackTrace(); } }else{ try { content = new StringBuffer(); is = new FileInputStream(f); BufferedReader br = new BufferedReader(new InputStreamReader( is, "GBK")); for (String line = null; (line = br.readLine()) != null;) { content.append(line).append("\n"); } is.close(); } catch (Exception e) { e.printStackTrace(); } } return content.toString(); } public static void main(String[] args) { long start = new Date().getTime(); int numIndexed = 0; try { numIndexed = createIndex(new File(INDEX_STORE_PATH), new File(DATA_DIR)); } catch (Exception e) { e.printStackTrace(); } long end = new Date().getTime(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } }
//xpdf配置说明:
1.从http://www.foolabs.com/xpdf/download.html上下载xpdf3.02(xpdf-3.02pl2-win32.zip)和xpdf-chinese-simplified.tar.gz。
2.将xpdf-3.02pl2-win32.zip解压放入c:/xpdf,同时将xpdf-chinese-simplified.tar.gz解压放入到该文件夹内。
3.打开解压后的xpdf-chinese-simplified文件夹下的add-to-xpdfrc文件,将其内容拷贝到
xpdfrc.txt中,修改如下代码:
#----- begin Chinese Simplified support package (2004-jul-27)
cidToUnicode Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode
unicodeMap ISO-2022-CN C:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap
unicodeMap EUC-CN C:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap
unicodeMap GBK C:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap
cMapDir Adobe-GB1 C:/xpdf/xpdf-chinese-simplified/CMap
toUnicodeDir C:/xpdf/xpdf-chinese-simplified/CMap
fontDir c:/windows/fonts
displayCIDFontTT Adobe-GB1 c:/windows/fonts/simhei(truetype)
textEOL CR+LF
#----- end Chinese Simplified support package
注意“C:/xpdf”部分路径要和你本机的实际路径一致;代码中的 PATH_TO_XPDF(示例为 C:\xpdftest\xpdf\pdftotext.exe)同样要指向你本机上 pdftotext.exe 的实际位置。
下面就是检索了: public class Search { private static String IndexDir="G:\\学习\\Lucene相关\\IndexWriter"; private static String keyWord="努力"; private static int TOP_NUM = 100; public static void doSearch(File indexDir,String key) throws Exception{ IndexSearcher searcher=new IndexSearcher(FSDirectory.open(indexDir),true); String field="content"; Query query=IKQueryParser.parse(field, keyWord); //========================================================================= long start=new Date().getTime(); TopDocs hits=searcher.search(query, TOP_NUM); long end = new Date().getTime();//end time System.out.println("共找到文档数:"+hits.totalHits); System.out.println("搜索完毕用时:" + (end - start) + "毫秒"); if(hits.totalHits==0){ System.out.println("没有找到您需要的结果!"); }else{ for(int i=0;i<hits.scoreDocs.length;i++){ try{ ScoreDoc scoreDoc = hits.scoreDocs[i];// 有变化的地方 Document doc = searcher.doc(scoreDoc.doc);// 有变化的地方 System.out.print("这是第" + (i+1) + "个检索结果,文件路径为:"); System.out.println(doc.get("filePath")); }catch(Exception e){ } } } searcher.close(); } public static void main(String[] args) throws Exception { File indexDir=new File(IndexDir); if(!indexDir.isDirectory()||!indexDir.exists()){ throw new Exception(indexDir + " does not exist or is not a directory。"); } doSearch(indexDir, keyWord); } }