运用Lucene建立索引和检索的方法

喜糖 2013-08-24

初学者   还在进步 勿批!

建索:

import java.io.File;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import com.jpsycn.kfwggl.common.tools.GetRootPath;

import com.jpsycn.kfwggl.common.tools.HandlerSummary;

import com.jpsycn.kfwggl.system.entity.ResultGetInfo;

/**
 * Builds a new Lucene index from a list of crawled page records.
 *
 * Each call to {@link #createIndex(List)} writes a brand-new index into its own
 * time-stamped sub-directory below the path returned by
 * {@code GetRootPath.getIndexesPath()}.
 */
public class CreateIndex {

    /** Analyzer used to tokenize every indexed field. */
    private Analyzer analyzer = new StandardAnalyzer();

    /**
     * Indexes every record of {@code lt} into a freshly created index directory.
     *
     * For each record four fields are written: {@code titleResult}, {@code content},
     * {@code link} and {@code releaseDate} (formatted as {@code yyyy-MM-dd}). A
     * {@code null} value is stored as an empty string. All fields are stored,
     * analyzed and carry term vectors.
     *
     * Failures are reported with {@code printStackTrace()} and are not rethrown;
     * the writer is closed in every case.
     *
     * @param lt the records to index
     */
    public void createIndex(List<ResultGetInfo> lt) {
        Date now = new Date();
        String root = GetRootPath.getIndexesPath();

        // mkdirs() also creates missing parent directories, which mkdir() would not.
        File rootDir = new File(root);
        if (!rootDir.exists()) {
            rootDir.mkdirs();
        }

        // Directory name: readable timestamp followed by the epoch millis.
        // "mm" = minutes and "ss" = seconds; the upper-case letters "MM"/"SS"
        // would repeat the month and print milliseconds instead.
        String indexPath = root + new SimpleDateFormat("yyyyMMddHHmmss").format(now) + now.getTime();
        System.out.println(indexPath);
        File indexDir = new File(indexPath);
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }

        IndexWriter indexWriter = null;
        try {
            SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");
            Directory directory = FSDirectory.getDirectory(indexPath);
            // 'true' => create a new index in this directory.
            indexWriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

            long begin = System.currentTimeMillis();
            for (ResultGetInfo rg : lt) {
                Document doc = new Document();

                // Guard the same getter that is dereferenced afterwards.
                String titleResult = rg.getTitleResult() == null ? "" : rg.getTitleResult().trim();
                String content = rg.getContent() == null ? "" : rg.getContent();
                String link = rg.getLink() == null ? "" : rg.getLink().trim();
                String releaseDate = rg.getReleaseDate() == null ? "" : dayFormat.format(rg.getReleaseDate());

                // First argument is the field name to search on later,
                // second argument is the text being indexed.
                doc.add(new Field("titleResult", titleResult, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                doc.add(new Field("link", link, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                doc.add(new Field("releaseDate", releaseDate, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

                indexWriter.addDocument(doc);
            }
            long end = System.currentTimeMillis();
            System.out.println(">>> 1.存入索引完毕.. 共花费:" + (end - begin) + "毫秒...");

            indexWriter.optimize();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the writer, even when indexing failed half-way.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

检索:

import java.io.File;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.MultiFieldQueryParser;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

/**
 * Searches the Lucene indexes found below a root directory and collects the
 * stored {@code link} value of every matching document.
 */
public class GetResultInfo {

    /**
     * Searches a single index directory.
     *
     * The keyword is parsed against the one field named by {@code titleOrContent}
     * and at most 10000 hits are requested from the searcher.
     *
     * @param keyName        the query text
     * @param INDEXPATH      path of the index directory to open
     * @param titleOrContent name of the indexed field to search
     * @return the trimmed {@code link} values of the hits, or {@code null} when the
     *         index cannot be opened or the search fails (the error is printed)
     */
    public static List<String> getResultInfos(String keyName, String INDEXPATH, String titleOrContent) {
        List<String> links = new ArrayList<String>();
        Analyzer analyzer = new StandardAnalyzer();
        // Must be the same field name that was used when the index was built.
        String link = "link";

        IndexSearcher indexSearcher = null;
        try {
            indexSearcher = new IndexSearcher(INDEXPATH);

            BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD };
            Query queryOBJ = MultiFieldQueryParser.parse(keyName, new String[] { titleOrContent }, clauses, analyzer);

            TopDocs topDocs = indexSearcher.search(queryOBJ, 10000);

            // Only the link is returned here. Highlighted title/content snippets
            // could be produced at this point with the Lucene highlighter if needed.
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document targetDoc = indexSearcher.doc(scoreDoc.doc);
                String url = targetDoc.get(link);
                // Skip a document that has no stored link instead of failing the whole search.
                if (url != null) {
                    links.add(url.trim());
                }
            }
            return links;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Release the searcher on every path, including failures.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Lists the absolute paths of all entries directly inside {@code path}.
     *
     * @param path directory that holds the index folders
     * @return the absolute path of each entry; empty when the directory has no
     *         entries, does not exist, or is not a directory
     */
    public static List<String> getDirPath(String path) {
        List<String> dirPaths = new ArrayList<String>();
        File[] files = new File(path).listFiles();
        // listFiles() yields null for a missing or non-directory path.
        if (files == null || files.length == 0) {
            System.out.println("没有存放索引的文件夹");
            return dirPaths;
        }
        for (File entry : files) {
            dirPaths.add(entry.getAbsolutePath());
        }
        return dirPaths;
    }

    /**
     * Runs every keyword of {@code str} against every index found below
     * {@code path} and merges the resulting links, dropping duplicates.
     *
     * An index that fails to open or search is skipped.
     *
     * @param path           directory that holds the index folders
     * @param str            keywords to search for
     * @param titleOrContent name of the indexed field to search
     * @return a map whose keys (and identical values) are the distinct links found
     */
    public static Map<String, String> getInfos(String path, String str[], String titleOrContent) {
        long begin = new Date().getTime();
        Map<String, String> map = new HashMap<String, String>();

        List<String> dirPaths = GetResultInfo.getDirPath(path);
        for (String keyword : str) {
            for (String dirPath : dirPaths) {
                List<String> infoList = GetResultInfo.getResultInfos(keyword, dirPath, titleOrContent);
                // null signals a failed search of this directory; keep going with the others.
                if (infoList == null) {
                    continue;
                }
                for (String found : infoList) {
                    map.put(found, found);
                }
            }
        }

        long end = new Date().getTime();
        System.out.println(">>> 搜索完毕... ... 共花费:" + (end - begin) + "毫秒...");
        System.out.println("一共检索到" + map.size() + "条");
        return map;
    }
}

这个例子是我对解析到的网页的路径、标题、内容、发布日期、来源建立索引,

然后通过对内容的检索  获取该网页的路径 

相关推荐

lionelf / 0评论 2020-07-28