Basic use of Lucene and Solr

Lucene

Lucene is a full-text search library. It provides a complete set of APIs for creating indexes and searching them, and we call these APIs from our own code to implement search features.

Inverted index

Lucene implements its indexing on top of an inverted index structure. So what is an inverted index?

There are three documents on the hard disk, the contents of which are as follows:

Document 1: Chinese English Japanese

Document 2: Chinese English Korean

Document 3: English Korean Japanese

Suppose we want to find the documents that contain the word "Chinese". Without an index, we would have to open every file and scan it for that word. An index built on this logic (document first, then its content) is called a forward index, and it is obviously inefficient. An inverted index instead records each keyword together with the documents that contain it:

Chinese: document 1 document 2

English: document 1 document 2 document 3

Korean: document 2 document 3

Japanese: document 1 document 3

In this way, we can search the corresponding documents through keywords. Search "Chinese" to find document 1 and document 2, and search "Japanese" to find document 1 and document 3.

In short, forward index is based on documents to find content, inverted index is based on content to find documents.

Extracting the keywords from a document requires word segmentation (tokenization), and for Chinese text that means using a Chinese word segmenter. There are many segmenters (analyzers) available, and the granularity of the segmentation affects the accuracy of the search. We will not discuss that here; this article uses the IK analyzer.

The Lucene version used in this article is 4.7.2; the latest version at the time of writing is 8.5.2. The API differs between versions, but the basic usage is largely the same.

Download address of each version of Lucene: http://archive.apache.org/dist/lucene/java/

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * lucene Version 4.7.2
 * @author administrator
 * @date 2020 June 11, 2006
 */
public class TestLucene {
	
	// Index file directory
	private static String INDEX_PATH =  "F:\\lucene_index";
	
	public static void main(String[] args) {
		//createIndex();
		searchIndex("Bridge");
	}
	
	/**
	 * Create index
	 * IntField FloatField LongField DoubleField StringField These types of fields don't participle, TextField can participle
	 */
	public static void createIndex() {
		Document doc = new Document();
		
		doc.add(new StringField("id", "1", Store.YES));
		TextField titleField = new TextField("title", "Nanjing Yangtze River Bridge", Store.YES);
		
		// This value will be used when scoring the incentive factor, which is equivalent to adding a weight to the Field. The default value is 1.0F
		titleField.setBoost(3.0F);
		
		doc.add(titleField);
		doc.add(new TextField("content", "Once our tour guide took the group to Nanjing. One of the guests asked, "is the mayor of Nanjing called jiangdaqiao?" our tour guide replied, "No."."
				+ "The guest is very strange to say that why when I pass by Nanjing, there is a sign on the side of the road that says "welcome to Nanjing Yangtze River Bridge"?", Store.YES));
		
		try {
			// Open the index file directory stored on the hard disk. There is also a RAMDirectory class, which is stored in memory
			Directory dir = FSDirectory.open(new File(INDEX_PATH));
			
		//	Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
			Analyzer analyzer = new IKAnalyzer();
			
			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); 
			IndexWriter idxWriter = new IndexWriter(dir, config);
			idxWriter.addDocument(doc);
			
			idxWriter.commit();
			idxWriter.close();
			System.out.println("Index created successfully");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Query index
	 * @param keyword Query keywords
	 */
	public static void searchIndex(String keyword) {
		try {
			Directory dir = FSDirectory.open(new File(INDEX_PATH));
			Analyzer analyzer = new IKAnalyzer();
			
			IndexReader idxReader = DirectoryReader.open(dir);
			
			IndexSearcher idxSearcher = new IndexSearcher(idxReader);
			// Single field query parser
			QueryParser queryParser = new QueryParser(Version.LUCENE_47, "title", analyzer);
			// Multi field query analyzer
			QueryParser multiParser = new MultiFieldQueryParser(Version.LUCENE_47, new String[] {"title","content"}, analyzer);
			// Query object
			Query query = queryParser.parse(keyword);
			
			// TermQuery does not segment queries. Term is the smallest word and cannot be subdivided
			Query termQuery = new TermQuery(new Term("title",keyword));
			// Wildcard query
			Query wildcardQuery = new WildcardQuery(new Term("title",keyword + "*"));
			
			// Boolean query, which can be used for logical combination and combination of several queries
			BooleanQuery booleanQuery = new BooleanQuery();
			booleanQuery.add(termQuery, Occur.MUST);
			booleanQuery.add(wildcardQuery,Occur.MUST_NOT);
			
			// Number of entries per page
			int pageSize = 2;
			// Current page number
			int pageNum = 1;
			// Start number of current page
			int start = (pageNum - 1) * pageSize;
			// End of current page
			int end = pageNum * pageSize;
			
			// Query top end documents
			TopDocs topDocs = idxSearcher.search(query, end);
			System.out.println("A total of" + topDocs.totalHits + "Data");
			
			// This is important because the total number of records can be less than pageNum * pageSize, and the array will be out of bounds when traversing the document
			end = Math.min(end, topDocs.totalHits);
			
			// Highlight format
			Formatter formatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
			Scorer scorer = new QueryScorer(query);
			Highlighter highlighter = new Highlighter(formatter, scorer);
			
			String highlightText = "";
			
			ScoreDoc[] scoreDocs = topDocs.scoreDocs;
			for (int i = start;i < end;i ++) {
				ScoreDoc scoreDoc = scoreDocs[i];
				int docID = scoreDoc.doc;
				Document doc = idxSearcher.doc(docID);
				
				System.out.println("id:" + doc.get("id"));
				System.out.println("title:" + doc.get("title"));
				System.out.println("content:" + doc.get("content"));
				
				highlightText = highlighter.getBestFragment(analyzer, "title", keyword);
				
				System.out.println(highlightText);
				
				System.out.println("score:" + scoreDoc.score);
			}
			
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		} catch (InvalidTokenOffsetsException e) {
			e.printStackTrace();
		}
	}
	
}

Solr

Solr is an enterprise-level search server built on top of Lucene. It extends Lucene and exposes its own API, so that we can operate on the index through HTTP requests.

For an explanation of Solr's directory structure and admin pages, and of how to configure them, this article is very clear: https://blog.csdn.net/zimiao552147572/article/details/89962231

Download address of each version of Solr: http://archive.apache.org/dist/lucene/solr/

Solr's Java client is SolrJ

<!-- https://mvnrepository.com/artifact/org.apache.solr/solr-solrj -->
<dependency>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-solrj</artifactId>
    <version>4.7.2</version>
</dependency>

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;

/**
 * 
 * @author administrator
 * @date 2020 June 12, 2006
 */
public class TestSolrJ {

	// solr service address
	private static final String URL = "http://localhost:8080/solr";

	public static void main(String[] args) {
		
		// Create solr server object
		SolrServer server = new HttpSolrServer(URL);
		
		SolrInputDocument inputDoc = new SolrInputDocument();
		
		inputDoc.setField("id","1");
		inputDoc.setField("title", "Nanjing Yangtze River Bridge", 3.0F);
		inputDoc.setField("content", "Once our tour guide took the group to Nanjing. One of the guests asked, "is the mayor of Nanjing called jiangdaqiao?" our tour guide replied, "No."."
				+ "The guest is very strange to say that why when I pass by Nanjing, there is a sign on the side of the road that says "welcome to Nanjing Yangtze River Bridge"?");
		
		try {
			UpdateResponse resp = server.add(inputDoc);
			System.out.println(resp.getStatus());
			// Submit changes
			server.commit();
			
		} catch (SolrServerException | IOException e) {
			e.printStackTrace();
		}
		
		try {
			
			// Query object
			SolrQuery query = new SolrQuery("title:Bridge AND content:mayor");
			//query.set("q","title: Bridge");
			
			// Number of entries per page
			int pageSize = 2;
			// Current page number
			int pageNum = 1;
			// Start number of current page
			int start = (pageNum - 1) * pageSize;
			// Number of paging start records
			query.setStart(start);
			// Records per page
			query.setRows(pageSize);
			
			// Turn on highlighting
			query.setHighlight(true);
			query.setHighlightRequireFieldMatch(true);
			query.addHighlightField("title");
			query.setHighlightSimplePre("<font color=\"red\">");
			query.setHighlightSimplePost("</font>");
			// Sorted by id in descending order
			query.setSort("id", ORDER.desc);
			
			// Call query method
			QueryResponse response = server.query(query);
			// Get highlighted results
			Map<String, Map<String, List<String>>> highlitMap = response.getHighlighting();

			Set<Entry<String, Map<String, List<String>>>> entrySet = highlitMap.entrySet();
			
			Iterator<Entry<String, Map<String, List<String>>>> ite = entrySet.iterator();
			while (ite.hasNext()) {
				Entry<String, Map<String, List<String>>> entry = ite.next();
				System.out.println("-----" + entry.getKey());
				System.out.println(entry.getValue());
			}
			
			// Number of results found
			long totalHit = response.getResults().getNumFound();
			
			System.out.println("A total of" + totalHit + "Data");
			// list of documents found
			SolrDocumentList docList = response.getResults();
			
			for (int i = 0; i < docList.size(); i++) {
				SolrDocument doc = docList.get(i);
				System.out.println(doc.getFieldValue("title"));
				System.out.println(doc.getFieldValue("content"));
			}
			
			// Delete index
			//server.deleteById("1");
			//server.commit();
			
		} catch (SolrServerException e) {
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			server.shutdown();
		}
		
	}
}

Posted by pointsplat on Mon, 15 Jun 2020 01:13:25 -0700

Programmer Group

Basic use of Lucene and Solr

Lucene

Inverted index

Solr

Hot Keywords