Lucene full-text search with a Chinese analyzer and highlighting

Chinese word segmentation needs the Chinese-analyzer jar, which we pull in from Maven:

	<!-- Lucene Chinese analyzer -->
	<dependency>
	    <groupId>org.apache.lucene</groupId>
	    <artifactId>lucene-analyzers-smartcn</artifactId>
	    <version>5.3.1</version>
	</dependency>
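
To see what the SmartCN analyzer actually produces, here is a minimal sketch (the class name `AnalyzerDemo` and the choice of sample sentence are mine, not part of the original article) that prints the tokens it emits for one of the descriptions used below:

	package LuceneTest.LuceneTest;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

	public class AnalyzerDemo {

	    public static void main(String[] args) throws Exception {
	        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();
	        // the field name only selects the analysis chain; any name works for a quick test
	        TokenStream ts = analyzer.tokenStream("desc", "南京是一个有文化的城市。");
	        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
	        ts.reset(); // must be called before the first incrementToken()
	        while (ts.incrementToken()) {
	            System.out.print("[" + term.toString() + "] ");
	        }
	        ts.end();
	        ts.close();
	        analyzer.close();
	    }
	}

The point of the Chinese analyzer is that it should emit whole words such as 南京 and 文化 rather than single characters, which is what the queries later in this article rely on.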

Below is an example of segmentation and indexing:

	package LuceneTest.LuceneTest;

	import java.nio.file.Paths;

	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.document.Field.Store;
	import org.apache.lucene.document.IntField;
	import org.apache.lucene.document.StringField;
	import org.apache.lucene.document.TextField;
	import org.apache.lucene.index.IndexWriter;
	import org.apache.lucene.index.IndexWriterConfig;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;

	public class IndexChina {

	    private Directory dir; // where the index is stored

	    // sample data for the test
	    private Integer ids[] = {1, 2, 3}; // used to identify the documents
	    private String citys[] = {"上海", "南京", "青岛"};
	    private String descs[] = {
	        "上海是个繁华的城市。",
	        "南京是一个有文化的城市。",
	        "青岛是一个美丽的城市。"
	    };

	    // build the index
	    public void index(String indexDir) throws Exception {
	        dir = FSDirectory.open(Paths.get(indexDir));
	        IndexWriter writer = getWriter();
	        for (int i = 0; i < ids.length; i++) {
	            Document doc = new Document();
	            doc.add(new IntField("id", ids[i], Store.YES));
	            doc.add(new StringField("city", citys[i], Store.YES));
	            doc.add(new TextField("desc", descs[i], Store.YES));
	            writer.addDocument(doc); // add the document
	        }
	        writer.close(); // the documents are only actually written out when close() is called
	    }

	    // get an IndexWriter instance
	    private IndexWriter getWriter() throws Exception {
	        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); // use the Chinese analyzer
	        IndexWriterConfig config = new IndexWriterConfig(analyzer); // hand the analyzer to the index-writer config
	        IndexWriter writer = new IndexWriter(dir, config); // create the index writer
	        return writer;
	    }

	    public static void main(String[] args) throws Exception {
	        new IndexChina().index("D:\\lucene2");
	    }
	}
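
Before moving on to the query class, a quick sanity check can confirm that the three documents really were committed when close() ran. This small sketch is my own addition (the class name `IndexCheck` is not part of the original article):

	package LuceneTest.LuceneTest;

	import java.nio.file.Paths;

	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;

	public class IndexCheck {

	    public static void main(String[] args) throws Exception {
	        // open the index that IndexChina wrote and report how many documents it holds
	        try (Directory dir = FSDirectory.open(Paths.get("D:\\lucene2"));
	             DirectoryReader reader = DirectoryReader.open(dir)) {
	            System.out.println("numDocs = " + reader.numDocs()); // should print 3 for the data above
	        }
	    }
	}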

Now create the query:

	package LuceneTest.LuceneTest;

	import java.nio.file.Paths;

	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;

	public class SearcherChina {

	    public static void search(String indexDir, String q) throws Exception {

	        Directory dir = FSDirectory.open(Paths.get(indexDir)); // the path to query, i.e. where the index lives
	        IndexReader reader = DirectoryReader.open(dir);
	        IndexSearcher searcher = new IndexSearcher(reader);
	        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); // use the Chinese analyzer
	        QueryParser parser = new QueryParser("desc", analyzer); // query parser for the desc field
	        Query query = parser.parse(q); // parse the query string into a Query object

	        long startTime = System.currentTimeMillis(); // search start time
	        TopDocs docs = searcher.search(query, 10); // run the search, keeping the top 10 hits in docs
	        long endTime = System.currentTimeMillis(); // search end time
	        System.out.println("Matching " + q + " took " + (endTime - startTime) + " ms");
	        System.out.println("Found " + docs.totalHits + " hits");

	        for (ScoreDoc scoreDoc : docs.scoreDocs) { // walk the hits
	            Document doc = searcher.doc(scoreDoc.doc); // scoreDoc.doc is the doc ID; fetch the document by it
	            System.out.println(doc.get("city"));
	            System.out.println(doc.get("desc"));
	        }
	        reader.close();
	    }

	    public static void main(String[] args) {
	        String indexDir = "D:\\lucene2";
	        String q = "上海繁华"; // the string to search for
	        try {
	            search(indexDir, q);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }
	}

Highlighting the search results

Pull in the highlighter jar:

	<!-- Lucene highlighter -->
	<dependency>
	    <groupId>org.apache.lucene</groupId>
	    <artifactId>lucene-highlighter</artifactId>
	    <version>5.3.1</version>
	</dependency>

Create the query and highlight the matched terms in the results:

	package LuceneTest.LuceneTest;

	import java.io.StringReader;
	import java.nio.file.Paths;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.search.highlight.Fragmenter;
	import org.apache.lucene.search.highlight.Highlighter;
	import org.apache.lucene.search.highlight.QueryScorer;
	import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
	import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;

	public class SearcherChina {

	    public static void search(String indexDir, String q) throws Exception {

	        Directory dir = FSDirectory.open(Paths.get(indexDir)); // the path to query, i.e. where the index lives
	        IndexReader reader = DirectoryReader.open(dir);
	        IndexSearcher searcher = new IndexSearcher(reader);
	        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(); // use the Chinese analyzer
	        QueryParser parser = new QueryParser("desc", analyzer); // query parser for the desc field
	        Query query = parser.parse(q); // parse the query string into a Query object

	        long startTime = System.currentTimeMillis(); // search start time
	        TopDocs docs = searcher.search(query, 10); // run the search, keeping the top 10 hits in docs
	        long endTime = System.currentTimeMillis(); // search end time
	        System.out.println("Matching " + q + " took " + (endTime - startTime) + " ms");
	        System.out.println("Found " + docs.totalHits + " hits");

	        // highlighting setup
	        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color=red>", "</font></b>"); // without arguments the default just bolds matches, i.e. <B></B>
	        QueryScorer scorer = new QueryScorer(query); // scores the query terms so the best fragment can be picked
	        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); // builds fragments based on that score
	        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, scorer);
	        highlighter.setTextFragmenter(fragmenter); // tell the highlighter which fragments to produce

	        for (ScoreDoc scoreDoc : docs.scoreDocs) { // walk the hits
	            Document doc = searcher.doc(scoreDoc.doc); // scoreDoc.doc is the doc ID; fetch the document by it
	            System.out.println(doc.get("city"));
	            System.out.println(doc.get("desc"));
	            String desc = doc.get("desc");

	            // print the highlighted fragment
	            if (desc != null) {
	                TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));
	                String summary = highlighter.getBestFragment(tokenStream, desc);
	                System.out.println(summary);
	            }
	        }

	        reader.close();
	    }

	    public static void main(String[] args) {
	        String indexDir = "D:\\lucene2";
	        String q = "南京文化"; // the string to search for
	        try {
	            search(indexDir, q);
	        } catch (Exception e) {
	            e.printStackTrace();
	        }
	    }
	}
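
With the formatter configured above, getBestFragment returns the stored desc text with each term the query matched wrapped in <b><font color=red> ... </font></b>; for the query 南京文化 that should wrap 南京 and 文化 in the matching document (assuming SmartCN segments them as single terms). It returns null when the text contains no matching term, so checking the summary for null before using it is also reasonable.
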
Original article: https://www.cnblogs.com/wangshouchang/p/6869630.html