
Lucene Tutorial 7 -- Lucene Example Code

 

Without further ado, straight to the example code. If you have read the previous articles in this series, this code will be a breeze; understanding it is what matters most.

The following two files belong to one project:

IndexDocument.java

package baseSample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;

public class IndexDocument {

	public static Directory getIndexDirectory(Directory directory,
			Analyzer analyzer) throws CorruptIndexException,
			LockObtainFailedException, IOException {
		IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
				new IndexWriter.MaxFieldLength(25000));
		// Tuning knobs for the indexing process:
//		iwriter.setMergeFactor(10); // merge factor: how many segments are merged at once
//		iwriter.setMaxMergeDocs(2000); // maximum number of documents per segment
//		iwriter.setMaxBufferedDocs(1); // number of documents buffered in memory before a flush
		// news Fields
		Field newsId = null;
		Field newsName = null;
		Field publishDate = null;
		Field newsSource = null;
		Field newssummay = null;
		// News article 1
		Document doc1 = new Document();
		newsId = new Field("newsId", "aaaa", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsName = new Field("newsName", "江苏常州曝疫苗造假大案7人被捕超百万人受害",
				Field.Store.YES, Field.Index.ANALYZED);
		publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES,
				Field.Index.ANALYZED);
		newssummay = new Field(
				"newssummay",
				"据香港明报报道,江苏常州爆出疫苗造假大案。当地著名疫苗生产商江苏延申生物科技股份有限公司(简称“江苏延申”)被国家药监局查实在疫苗生产过程中长期故意造假,导致大量问题疫苗流向市场,受害者最少超过100万人。",
				Field.Store.YES, Field.Index.ANALYZED);
		doc1.add(newsId);
		doc1.add(newsName);
		doc1.add(publishDate);
		doc1.add(newsSource);
		doc1.add(newssummay);

		iwriter.addDocument(doc1);
		// News article 2
		Document doc2 = new Document();
		newsId = new Field("newsId", "bbbb", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsName = new Field("newsName", "一月内发生三起坠楼案", Field.Store.YES,
				Field.Index.ANALYZED);
		publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsSource = new Field("newsSource", "广州日报", Field.Store.YES,
				Field.Index.ANALYZED);
		newssummay = new Field("newssummay",
				"昨日凌晨3时左右,科技集团龙华厂区的一名23岁湖南籍男性员工从宿舍楼上坠下,当场死亡",
				Field.Store.YES, Field.Index.ANALYZED);
		doc2.add(newsId);
		doc2.add(newsName);
		doc2.add(publishDate);
		doc2.add(newsSource);
		doc2.add(newssummay);
		iwriter.addDocument(doc2);

		// News article 3
		Document doc3 = new Document();
		newsId = new Field("newsId", "cccc", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsName = new Field("newsName", "普京称要消灭掉制造地铁爆炸案恐怖分子", Field.Store.YES,
				Field.Index.ANALYZED);
		publishDate = new Field("publishDate", "2010/3/30", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsSource = new Field("newsSource", "网易新闻频道", Field.Store.YES,
				Field.Index.ANALYZED);
		newssummay = new Field("newssummay",
				"据外电报道,俄罗斯总理普京29日表示,当天制造莫斯科地铁连环爆炸案的恐怖分子一定会被抓到,并被消灭掉。",
				Field.Store.YES, Field.Index.ANALYZED);
		doc3.add(newsId);
		doc3.add(newsName);
		doc3.add(publishDate);
		doc3.add(newsSource);
		doc3.add(newssummay);
		// doc3.setBoost(2);
		iwriter.addDocument(doc3);

		// News article 4
		Document doc4 = new Document();
		newsId = new Field("newsId", "dddd", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsName = new Field("newsName", "最天使", Field.Store.YES,
				Field.Index.ANALYZED);
		publishDate = new Field("publishDate", "2009/3/30", Field.Store.YES,
				Field.Index.NOT_ANALYZED);
		newsSource = new Field("newsSource", "易", Field.Store.YES,
				Field.Index.ANALYZED);
		newssummay = new Field("newssummay", "长肥了", Field.Store.YES,
				Field.Index.ANALYZED);
		doc4.add(newsId);
		doc4.add(newsName);
		doc4.add(publishDate);
		doc4.add(newsSource);
		doc4.add(newssummay);
		iwriter.addDocument(doc4);

		iwriter.close();

		return directory;
	}

}
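
If you just want to exercise IndexDocument without writing anything to disk, here is a minimal sketch using an in-memory RAMDirectory (the commented-out option in SampleSearch below). The class name InMemoryIndexDemo is only illustrative, not part of the original project:

package baseSample;

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class InMemoryIndexDemo {
	public static void main(String[] args) throws Exception {
		// Build the sample index entirely in memory -- nothing touches disk
		Directory directory = IndexDocument.getIndexDirectory(
				new RAMDirectory(), new CJKAnalyzer(Version.LUCENE_30));
		// Print the names of the index files Lucene created
		System.out.println(java.util.Arrays.toString(directory.listAll()));
	}
}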

SampleSearch.java

package baseSample;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;

public class SampleSearch{
	public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException, ParseException {
		
		//Store the index in memory:
//		Directory directory  = new RAMDirectory();
		//To store an index on disk, use this instead:
		File file = new File("D:/mapreduce-out/lucenetmp/cache.txt");
		if (file.exists()) {
			System.out.println("Path already exists; deleting it");
			file.delete(); // note: delete() fails silently on a non-empty directory
		}
		// FSDirectory.open treats this path as an index *directory*, despite the .txt name
		Directory directory = FSDirectory.open(file);
		// A CJK-aware analyzer suits the Chinese sample data better than StandardAnalyzer,
		// which would split the text character by character
//		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
		Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
		// Now search the index -- this call also builds and writes the Lucene index files first
		IndexSearcher isearcher = new IndexSearcher(IndexDocument.getIndexDirectory(directory, analyzer), true);
		
		/**
		 * The main search methods on IndexSearcher:
		 * isearcher.search(Query query, Collector results);
		 * isearcher.search(Query query, int n);
		 * isearcher.search(Query query, Filter filter, Collector results);
		 */
		// A Term is the basic unit of a query
		// 1. TermQuery
		Query termQuery = new TermQuery(new Term("newsSource", "网易"));
		System.out.println("--- termQuery : "+termQuery.toString());
		
		// 2. BooleanQuery; Lucene also provides RangeQuery (range search), PrefixQuery (prefix search), FuzzyQuery (fuzzy search), etc. -- see the sketch just below
		Query a = new TermQuery(new Term("newsSource", "网"));
		Query b = new TermQuery(new Term("newsSource", "易"));
		BooleanQuery booleanQuery = new BooleanQuery();
		booleanQuery.add(a, BooleanClause.Occur.MUST);
		booleanQuery.add(b, BooleanClause.Occur.MUST);
		System.out.println("--- booleanQuery :"+ booleanQuery.toString());
		
		// 3. Use QueryParser to tokenize a query string into a Query
		System.out.println("Current Lucene version : " + Version.LUCENE_CURRENT);
		QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "newsSource", analyzer);
		parser.setDefaultOperator(QueryParser.AND_OPERATOR); // terms are OR'ed by default; switch the default to AND
		Query parserQuery = parser.parse("java lucene");
		System.out.println("--- parserQuery : "+parserQuery.toString());
		
		// 4. Use MultiFieldQueryParser to search several fields at once
		String[] fields = {"newsName","newsSource"};
		MultiFieldQueryParser mparser = new MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer);
		Query mQuery = mparser.parse("江苏");
		System.out.println("---- mQuery :"+mQuery);
		
		ScoreDoc[] docs = isearcher.search(termQuery, 10).scoreDocs;
		for (int i = 0; i < docs.length; i++) {
			System.out.println(docs[i].doc);
			System.out.println("searcher score :" + docs[i].score);
			Document hitDoc = isearcher.doc(docs[i].doc);
			System.out.println("--- explain : " + isearcher.explain(termQuery, docs[i].doc));
			// note: index-time boosts are folded into the norms, so getBoost() on a
			// retrieved document always returns the default value 1.0
			System.out.println("boost:" + hitDoc.getBoost());
			System.out.println("newsId:" + hitDoc.get("newsId"));
			System.out.println("newsName:" + hitDoc.get("newsName"));
			System.out.println("publishDate:" + hitDoc.get("publishDate"));
			System.out.println("newsSource:" + hitDoc.get("newsSource"));
			System.out.println("newssummay:" + hitDoc.get("newssummay"));
			System.out.println("------------------------------------------");
		}
	}

}

The next two files likewise go together:

TextFileIndexer.java

package lighter.javaeye.com;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TextFileIndexer {
	public static void main(String[] args) throws IOException {
		// Location of the folder containing the files to be indexed
		File fileDir = new File("D:/mapreduce-out/lucenetmp/demo1");

		// Location where the generated index files will be stored
		File indexDir = new File("D:/mapreduce-out/lucenetmp/demo2");
		Directory docx = FSDirectory.open(indexDir);
		Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
		// Index at most the first 100 terms of each field
		IndexWriter.MaxFieldLength mf = new MaxFieldLength(100);
		IndexWriter indexWriter = new IndexWriter(docx, luceneAnalyzer, mf);
		File[] textFiles = fileDir.listFiles();
		long startTime = new Date().getTime();
		
		for (int i = 0; i < textFiles.length; i++) {
			if (textFiles[i].isFile() && textFiles[i].getName().endsWith(".txt")) {
				System.out.println("File " + textFiles[i].getCanonicalPath() + " is being indexed");
				String temp = fileReaderAll(textFiles[i].getCanonicalPath(), "GBK");
				System.out.println("temp = " + temp);
				Document document = new Document();
				Field fieldPath = new Field("path", textFiles[i].getPath(),Field.Store.YES, Field.Index.NO) ;
				Field fieldBody = new Field("body", temp, Field.Store.YES, Field.Index.ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS) ;
				document.add(fieldPath);
				document.add(fieldBody);
				indexWriter.addDocument(document);
			}
		}
		
		// optimize() merges all index segments for faster searching
		// (deprecated in later Lucene versions in favor of forceMerge)
		indexWriter.optimize();
		indexWriter.close();
		
		long endTime = new Date().getTime();
		System.out.println("It took " + (endTime - startTime) + " milliseconds to add the documents in " + fileDir.getPath() + " to the index!");
	}
	
	public static String fileReaderAll(String fileName, String charset) throws IOException {
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(fileName), charset));
		// Read the whole file into one string; StringBuilder avoids the
		// quadratic cost of string concatenation in a loop
		StringBuilder temp = new StringBuilder();
		String line;
		while ((line = reader.readLine()) != null) {
			temp.append(line);
		}
		reader.close();
		return temp.toString();
	}
}
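
TextFileIndexer assumes that D:/mapreduce-out/lucenetmp/demo1 already contains GBK-encoded .txt files. Here is a minimal sketch for creating one such file; MakeTestFile is a hypothetical helper, not part of the original post, and it writes the phrase that TestQuery below searches for:

package lighter.javaeye.com;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;

public class MakeTestFile {
	public static void main(String[] args) throws Exception {
		File dir = new File("D:/mapreduce-out/lucenetmp/demo1");
		dir.mkdirs(); // make sure the input folder exists
		// Write a small GBK-encoded file containing "中华", the term TestQuery looks for
		Writer w = new OutputStreamWriter(
				new FileOutputStream(new File(dir, "test.txt")), "GBK");
		w.write("中华人民共和国");
		w.close();
	}
}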




TestQuery.java

package lighter.javaeye.com;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestQuery {
	public static void main(String[] args) throws IOException {
		TopDocs topDoc = null ;
		String queryString = "中华" ;
		Query query = null ;
		Directory directory = FSDirectory.open(new File("D:/mapreduce-out/lucenetmp/demo2"));
		IndexSearcher search = new IndexSearcher(directory) ;
		
		Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
		
		try {
			QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer) ;
			query = qp.parse(queryString);
		} catch (ParseException e) {
			e.printStackTrace() ;
		}
		// Guard on query: it stays null if parsing failed above
		if (query != null) {
			topDoc = search.search(query, 100);
			if (topDoc.getMaxScore() > 0) {
				System.out.println("topDoc.totalHits" + topDoc.totalHits);
				System.out.println("topDoc.getMaxScore()" + topDoc.getMaxScore());
				System.out.println("topDoc.toString()" + topDoc.toString());
			} else {
				System.out.println("没有查询到结果");
			}
		}
		
	}
}
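
TestQuery only prints the TopDocs summary. To inspect the individual hits you could iterate topDoc.scoreDocs inside the if branch, using the same IndexSearcher.doc(int) pattern as SampleSearch above. A minimal sketch (it additionally needs imports for ScoreDoc and Document):

for (ScoreDoc sd : topDoc.scoreDocs) {
	// Load each hit's stored fields and print them with its score
	Document hitDoc = search.doc(sd.doc);
	System.out.println("score = " + sd.score + ", path = " + hitDoc.get("path"));
	System.out.println("body = " + hitDoc.get("body"));
}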



