qiuzhuoxian 2008-01-14
TermVector是Lucene 1.4新增的功能,它提供一种向量机制来进行模糊查询。TermVector保存Token.getPositionIncrement()、Token.startOffset()以及Token.endOffset()的信息。
Field.TermVector.NO:不保存term vectors
Field.TermVector.YES:保存termvectors
Field.TermVector.WITH_POSITIONS:保存termvectors.(保存值和token位置信息)
Field.TermVector.WITH_OFFSETS:保存termvectors.(保存值和Token的offset)
Field.TermVector.WITH_POSITIONS_OFFSETS:保存term vectors.(保存值和token位置信息和Token的offset)
下面是个简单的例子:
Analyzer analyzer = new StandardAnalyzer(); RAMDirectory directory = new RAMDirectory(); /** * 创建索引 * * @throws IOException */ public void index() throws IOException{ IndexWriter indexWriter = new IndexWriter(directory,analyzer,true); Document doc1 = new Document(); doc1.add(new Field("title","java",Store.YES,Index.TOKENIZED)); doc1.add(new Field("author","callan",Store.YES,Index.TOKENIZED)); doc1.add(new Field("subject", "java一门编程语言", Store.YES, Index.TOKENIZED,TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc1); Document doc2 = new Document(); doc2.add(new Field("title","english",Store.YES,Index.TOKENIZED)); doc2.add(new Field("author","wcq",Store.YES,Index.TOKENIZED)); doc2.add(new Field("subject", "英语用的人很多", Store.YES, Index.TOKENIZED,TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc2); Document doc3 = new Document(); doc3.add(new Field("title","asp",Store.YES,Index.TOKENIZED)); doc3.add(new Field("author","ca",Store.YES,Index.TOKENIZED)); doc3.add(new Field("subject", "asp很多人用", Store.YES, Index.TOKENIZED,TermVector.WITH_POSITIONS_OFFSETS)); indexWriter.addDocument(doc3); indexWriter.optimize(); indexWriter.close(); } // 进行搜索 public void searcher() throws IOException{ IndexSearcher searcher = new IndexSearcher(directory); // 搜索书名为java的索引 TermQuery query = new TermQuery(new Term("title","java")); Hits hits = searcher.search(query); // 能找到一条记录 for(int i = 0; i < hits.length(); i++){ Document doc = hits.doc(i); System.out.println("书名:" + doc.get("title") + " " + "作者:" + doc.get("author") + "简介:" + doc.get("subject")); System.out.println("相关的书:"); docsLike(hits.id(i)); } } // 在subject中模糊搜索与doc相进的索引 public void docsLike(int id) throws IOException { IndexReader reader = IndexReader.open(directory); TermFreqVector vector = reader.getTermFreqVector(id, "subject"); BooleanQuery query = new BooleanQuery(); for (int j = 0; j < vector.size(); j++) { TermQuery tq = new TermQuery(new Term("subject", vector.getTerms()[j])); query.add(tq, 
BooleanClause.Occur.SHOULD); } IndexSearcher searcher = new IndexSearcher(directory); Hits hits = searcher.search(query); printResult(hits); } // 显示结果 public void printResult(Hits hits) throws IOException{ for(int i = 0; i < hits.length(); i++){ Document d = hits.doc(i); System.out.println("书名:" + d.get("title")+" " + "作者:" + d.get("author") +" " + "简介:" + d.get("subject")); } } public static void main(String[] args) throws IOException { TermFreqVectorTest3 test = new TermFreqVectorTest3(); test.index(); test.searcher(); }
搜索结果:
书名:java 作者:callan简介:java一门编程语言
相关的书:
书名:java 作者:callan 简介:java一门编程语言
书名:english 作者:wcq 简介:英语用的人很多
搜索书名为java的索引,并且搜索与java的简介相关的索引.
将书<<java>>的subject分词为java/一/门/编/程/语/言,然后在subject中搜索包含java/一/门/编/程/语/言中任意一个词的索引.
<<english>>的subject中包含"语"字,因此也被搜索出来.