1. Introduction
Lucene is a subproject of the Apache Software Foundation's Jakarta project: an open-source full-text search toolkit. It is not a complete full-text search engine, but rather an architecture for one, providing a complete query engine and indexing engine plus partial text-analysis support (analyzers for English and German, two Western languages). Lucene's goal is to give software developers a simple, easy-to-use toolkit for adding full-text search to a target system, or a foundation on which to build a complete full-text search engine.
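To make the "query engine plus indexing engine" idea concrete, here is a minimal in-memory sketch. It is my own illustration, not part of the project below; it assumes only lucene-core 3.0.x on the classpath, and HelloLucene is a made-up class name:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class HelloLucene {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();// index kept in memory
        // indexing engine: analyze a document and write it to the index
        IndexWriter writer = new IndexWriter(dir,
                new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        doc.add(new Field("contents", "Hello Lucene world",
                Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
        // query engine: parse a query string and search the index
        IndexSearcher searcher = new IndexSearcher(dir, true);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
                new StandardAnalyzer(Version.LUCENE_30));
        TopDocs hits = searcher.search(parser.parse("hello"), 10);
        System.out.println("hits: " + hits.totalHits);// expect 1
        searcher.close();
    }
}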
2. Download
3. Test
In Eclipse, create a project named Lucence3.0Test and add the following JARs to the build path:
lucene-core-3.0.2.jar
lucene-demos-3.0.2.jar
lucene-analyzers-3.0.2.jar
lucene-fast-vector-highlighter-3.0.2.jar
lucene-highlighter-3.0.2.jar
lucene-memory-3.0.2.jar
Create two folders in any directory: one to hold the files Lucene will tokenize, and one to hold the generated index. Here, file1 holds the files to be tokenized: N .txt files with arbitrary content, for example 我们都是中国人. The index folder is newly created and must contain no files; it is where the generated index will be stored.
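If you prefer to script this setup, here is a small helper of my own (not from the original post); it creates both folders under the D:\nutchwork\LucenceTestDir root used throughout, and writes four GBK-encoded sample files matching what the indexer below reads:

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;

public class SetupDirs {
    public static void main(String[] args) throws Exception {
        File root = new File("D:\\nutchwork\\LucenceTestDir");
        new File(root, "index").mkdirs();// must stay empty
        File data = new File(root, "file1");
        data.mkdirs();
        for (int i = 1; i <= 4; i++) {// four sample files, GBK-encoded
            OutputStreamWriter w = new OutputStreamWriter(
                    new FileOutputStream(new File(data, i + ".txt")), "GBK");
            w.write("我们是中国人");
            w.close();
        }
    }
}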
4. Code
Create the indexing class and test it:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IndexerOK {
    private static String INDEX_DIR = "D:\\nutchwork\\LucenceTestDir\\index";// where the index is stored
    private static String DATA_DIR = "D:\\nutchwork\\LucenceTestDir\\file1";// where the small files live

    public static void main(String[] args) throws Exception {
        long start = new Date().getTime();
        int numIndexed = index(new File(INDEX_DIR), new File(DATA_DIR));// call the index method
        long end = new Date().getTime();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Indexes the .txt files under dataDir, stores the index under indexDir,
     * and returns the number of files indexed.
     *
     * @param indexDir
     * @param dataDir
     * @return int
     * @throws IOException
     */
    public static int index(File indexDir, File dataDir) throws IOException {
        if (!dataDir.exists() || !dataDir.isDirectory()) {
            throw new IOException(dataDir + " does not exist or is not a directory");
        }
        Analyzer analyzer = new IKAnalyzer();// the analyzer to use
        // third argument: true = create a new index, false = append to an existing one
        IndexWriter writer = new IndexWriter(FSDirectory.open(indexDir), analyzer,
                true, IndexWriter.MaxFieldLength.LIMITED);
        indexDirectory(writer, dataDir);// call the indexDirectory method
        int numIndexed = writer.numDocs();
        writer.optimize();
        writer.close();
        return numIndexed;
    }

    /**
     * Walks the directory recursively and indexes every .txt file.
     *
     * @param writer
     * @param dir
     * @throws IOException
     */
    private static void indexDirectory(IndexWriter writer, File dir)
            throws IOException {
        File[] files = dir.listFiles();
        for (int i = 0; i < files.length; i++) {
            File f = files[i];
            if (f.isDirectory()) {
                indexDirectory(writer, f); // recurse
            } else if (f.getName().endsWith(".txt")) {
                indexFile(writer, f);
            }
        }
    }

    /**
     * Indexes a single .txt file.
     *
     * @param writer
     * @param f
     * @throws IOException
     */
    private static void indexFile(IndexWriter writer, File f) throws IOException {
        if (f.isHidden() || !f.exists() || !f.canRead()) {
            return;
        }
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = new Document();
        // doc.add(new Field("contents", new FileReader(f)));
        doc.add(new Field("filename", f.getCanonicalPath(), Field.Store.YES,
                Field.Index.ANALYZED));
        String temp = FileReaderAll(f.getCanonicalPath(), "GBK");
        System.out.println(temp);
        doc.add(new Field("TTT", temp, Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("path", f.getPath(), Field.Store.YES, Field.Index.ANALYZED));
        doc.add(new Field("modified", DateTools.timeToString(f.lastModified(),
                DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.ANALYZED));
        FileInputStream fis = new FileInputStream(f);
        // wrap the byte stream in a UTF-8 character stream
        InputStreamReader isr = new InputStreamReader(fis, "utf-8");
        // buffer the character stream
        BufferedReader br = new BufferedReader(isr);
        // a Reader-backed field is tokenized and indexed, but not stored
        doc.add(new Field("contents", br));
        writer.setUseCompoundFile(false);
        writer.addDocument(doc);
    }

    public static String FileReaderAll(String FileName, String charset)
            throws IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(FileName), charset));
        String line = new String();
        String temp = new String();
        while ((line = reader.readLine()) != null) {
            temp += line;
        }
        reader.close();
        return temp;
    }
}
Run result:
Indexing D:\nutchwork\LucenceTestDir\file1\1.txt
我们是中国人
Indexing D:\nutchwork\LucenceTestDir\file1\2.txt
我们是中国人
Indexing D:\nutchwork\LucenceTestDir\file1\3.txt
我们是中国人
Indexing D:\nutchwork\LucenceTestDir\file1\4.txt
我们是中国人
Indexing 4 files took 2293 milliseconds
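To see exactly which tokens IKAnalyzer produced for those files, a quick sketch like the following prints them. This is my own snippet, not from the original post; it assumes the same IKAnalyzer jar as above, and TermAttribute is the token-attribute API of the Lucene 3.0 era:

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class ShowTokens {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new IKAnalyzer();
        TokenStream ts = analyzer.tokenStream("TTT",
                new StringReader("我们是中国人"));
        TermAttribute term = ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(term.term());// one line per token IK produces
        }
        ts.close();
    }
}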
Create the search class and test it:
import java.io.File;
import java.io.StringReader;
import java.util.Date;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKQueryParser;
import org.wltea.analyzer.lucene.IKSimilarity;

public class SearchQueryOK {
    private static String INDEX_DIR = "D:\\nutchwork\\LucenceTestDir\\index";// where the index lives
    private static String KEYWORD = "中国人";// the keyword to search for
    private static int TOP_NUM = 100;// show the top 100 results

    public static void main(String[] args) throws Exception {
        File indexDir = new File(INDEX_DIR);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            throw new Exception(indexDir + " does not exist or is not a directory.");
        }
        search(indexDir, KEYWORD);// run the search
    }

    /**
     * Search
     *
     * @param indexDir
     * @param q
     * @throws Exception
     */
    public static void search(File indexDir, String q) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(indexDir), true);// read-only
        String[] field = { "TTT", "modified", "filename" };
        long start = new Date().getTime();// start time

        // highlighting setup
        Analyzer analyzer = new IKAnalyzer();// the analyzer
        Query query2 = IKQueryParser.parseMultiField(field, KEYWORD);

        // instantiate the searcher
        IndexSearcher isearcher1 = new IndexSearcher(FSDirectory.open(indexDir));
        // use IK's similarity implementation for scoring
        isearcher1.setSimilarity(new IKSimilarity());

        Sort sort = new Sort(new SortField("path", SortField.DOC, false));
        // TermQuery q1 = new TermQuery(new Term("filename", "1"));

        // fetch the highest-scoring hits
        TopDocs topDocs1 = isearcher1.search(query2, null, TOP_NUM, sort);
        ScoreDoc[] hits3 = topDocs1.scoreDocs;

        // the highlight format: a prefix and suffix wrapped around matched terms
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("", "");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter,
                new QueryScorer(query2));

        for (int i = 0; i < hits3.length; i++) {
            Document doc = is.doc(hits3[i].doc);
            String docTTT = doc.get("TTT");
            // cap the fragment length: search engines show an excerpt,
            // not the whole document
            highlighter.setTextFragmenter(new SimpleFragmenter(docTTT.length()));
            TokenStream tokenStream = analyzer.tokenStream("",
                    new StringReader(docTTT));
            String str = highlighter.getBestFragment(tokenStream, docTTT);
            System.out.println(" highlighted: " + str);

            String docModified = doc.get("filename");
            highlighter.setTextFragmenter(new SimpleFragmenter(docModified.length()));
            TokenStream tokenStream2 = analyzer.tokenStream("",
                    new StringReader(docModified));
            String str2 = highlighter.getBestFragment(tokenStream2, docModified);
            System.out.println(" highlighted: " + str2);

            List<Fieldable> list = doc.getFields();
            for (int j = 0; j < list.size(); j++) {
                Fieldable fieldable = list.get(j);
                System.out.println(fieldable.name() + " : "
                        + fieldable.stringValue() + " ");
            }
        }
        long end = new Date().getTime();// end time
        System.out.println("Found " + hits3.length + " document(s) (in "
                + (end - start) + " milliseconds) that matched query '" + q + "':");
    }
}
A class for maintaining the index (delete, update, add):
import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class ManageIndexFile {
    private static String INDEX_DIR = "D:\\nutchwork\\LucenceTestDir\\index";// where the index is stored

    // delete from the index
    public static void DeleteIndex(SearchDocBean bean) throws IOException {
        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        IndexReader reader = IndexReader.open(dir, false);
        Term term = new Term("modified", bean.getId());
        int count = reader.deleteDocuments(term);
        reader.close();
        System.out.println("Successful Delete " + count + " path==" + bean.getId());
    }

    public static void DeleteIndex(int[] posIDS) throws IOException {
        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        IndexReader reader = IndexReader.open(dir, false);
        for (int i = 0; i < posIDS.length; i++) {
            Term term = new Term("posID", Integer.toString(posIDS[i]));
            reader.deleteDocuments(term);
        }
        reader.close();
    }

    // update the index
    public static void UpdateIndex(SearchDocBean bean) throws IOException {
        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        IndexReader reader = IndexReader.open(dir, false);
        Term term = new Term("modified", bean.getId());
        reader.deleteDocuments(term);
        reader.close();
        // the create flag must be false here: true would wipe the existing
        // index instead of appending to it
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                new StandardAnalyzer(Version.LUCENE_CURRENT), false,
                IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        doc.add(new Field("modified", bean.getId(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
        writer.optimize();
        writer.close();
    }

    // add to the index
    public static void AddIndex(SearchDocBean bean, Connection conn)
            throws IOException, SQLException {
        Analyzer analyzer = new IKAnalyzer();// the analyzer
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        doc.add(new Field("filename", bean.getFileName(), Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("path", bean.getPath(), Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("dateTime", bean.getId(), Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("TTT", bean.getContents(), Field.Store.YES,
                Field.Index.ANALYZED));
        writer.setUseCompoundFile(false);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();
    }
}
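Note that Lucene 3.0 can collapse UpdateIndex's delete-then-add sequence into one atomic call, IndexWriter.updateDocument. A sketch of that variant (my own, reusing the SearchDocBean and field names from this post):

import java.io.File;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class UpdateIndexAtomic {
    private static String INDEX_DIR = "D:\\nutchwork\\LucenceTestDir\\index";

    public static void updateIndex(SearchDocBean bean) throws IOException {
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File(INDEX_DIR)),
                new IKAnalyzer(), false, IndexWriter.MaxFieldLength.LIMITED);
        Document doc = new Document();
        doc.add(new Field("modified", bean.getId(), Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        // deletes every document containing the term, then adds the new
        // document, in a single call
        writer.updateDocument(new Term("modified", bean.getId()), doc);
        writer.optimize();
        writer.close();
    }
}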
The bean that encapsulates a search result:
public class SearchDocBean {
    private String id;
    private String path;
    private String contents;
    private String dateTime;
    private String fileName;

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getPath() {
        return path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    public String getContents() {
        return contents;
    }

    public void setContents(String contents) {
        this.contents = contents;
    }

    public String getDateTime() {
        return dateTime;
    }

    public void setDateTime(String dateTime) {
        this.dateTime = dateTime;
    }

    public String getFileName() {
        return fileName;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }
}
Below are the results of searching for 中国人. (The second highlight per hit prints null: getBestFragment returns null when none of the query terms occur in the given text, and 中国人 never appears in the filename string.)
 highlighted: 中国人
 highlighted: null
filename : D:\nutchwork\LucenceTestDir\file1\1.txt
TTT : 我们是中国人
path : D:\nutchwork\LucenceTestDir\file1\1.txt
modified : 201107161115
 highlighted: 中国人
 highlighted: null
filename : D:\nutchwork\LucenceTestDir\file1\2.txt
TTT : 我们是中国人
path : D:\nutchwork\LucenceTestDir\file1\2.txt
modified : 201107161115
 highlighted: 中国人
 highlighted: null
filename : D:\nutchwork\LucenceTestDir\file1\3.txt
TTT : 我们是中国人
path : D:\nutchwork\LucenceTestDir\file1\3.txt
modified : 201107161115
 highlighted: 中国人
 highlighted: null
filename : D:\nutchwork\LucenceTestDir\file1\4.txt
TTT : 我们是中国人
path : D:\nutchwork\LucenceTestDir\file1\4.txt
modified : 201107161115
Found 4 document(s) (in 717 milliseconds) that matched query '中国人':
That is the whole project. The code is mostly pulled from the web; I ran it to get a rough feel for how Lucene works.