Lucene note25 - use of Lucene - scoring settings by domain

I. Requirements

Adjust document scores according to custom rules: for example, score by file name, or use the document's modification time to double the score of documents modified within the last year and reduce the score of documents older than one year, and so on.

II. Specific implementation

Here, the scoring rule is based on the file name: documents whose file name contains "JRE" or "SYSTEM" have their score multiplied by 10, and all other documents have their score divided by 10. The key question is how to get the file name. The customScore() method receives a doc parameter (the document ID), and we can use it to look up the document's file name. Another point is that Lucene has a field cache (FieldCache): as long as the IndexReader is not closed, the cached field data stays available. We can use this feature to read the contents of the fileName field.

package com.wsy;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.function.CustomScoreProvider;
import org.apache.lucene.search.function.CustomScoreQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Random;

/**
 * Demonstrates custom scoring with Lucene 3.5: documents whose {@code fileName}
 * field contains "JRE" or "SYSTEM" have their relevance score multiplied by 10,
 * all other documents have it divided by 10.
 *
 * <p>The index lives in {@link #INDEX_PATH} and is built from the files found
 * directly inside {@link #SOURCE_PATH}.
 */
public class MyScoreQuery {
    /** Location of the Lucene index on disk. */
    private static final String INDEX_PATH = "E:\\Lucene\\IndexLibrary";
    /** Directory whose files are indexed. */
    private static final String SOURCE_PATH = "E:\\Lucene\\SearchSource";

    private static final Directory directory;
    /** Shared reader; opened lazily and refreshed by {@link #getReader()}. */
    private static IndexReader indexReader;

    static {
        try {
            directory = FSDirectory.open(new File(INDEX_PATH));
        } catch (IOException e) {
            // Nothing in this class can work without the directory, so fail fast
            // instead of leaving a null field that causes an NPE later on.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Returns a reader that reflects the current state of the index.
     *
     * <p>The reader is opened on first use (not at class-load time, when the
     * index may not exist yet) and is replaced by a fresh one whenever the
     * index has changed since it was opened, e.g. after {@link #index(boolean)}.
     *
     * @return an up-to-date shared reader
     * @throws IOException if the index cannot be opened
     */
    private static synchronized IndexReader getReader() throws IOException {
        if (indexReader == null) {
            indexReader = IndexReader.open(directory);
        } else {
            // openIfChanged returns null when the index is unchanged.
            IndexReader refreshed = IndexReader.openIfChanged(indexReader);
            if (refreshed != null) {
                indexReader.close();
                indexReader = refreshed;
            }
        }
        return indexReader;
    }

    /**
     * Indexes every regular file located directly inside {@link #SOURCE_PATH}.
     *
     * <p>Each document gets the fields {@code content}, {@code fileName},
     * {@code path}, {@code date}, {@code size} and a random {@code score}
     * in the range 0-99. I/O errors are printed, not propagated.
     *
     * @param update when {@code true}, all existing documents are deleted
     *               before the files are added
     */
    public void index(boolean update) {
        // List the source files first: listFiles() returns null when the path is
        // missing or not a directory, and we must not wipe the index in that case.
        File[] files = new File(SOURCE_PATH).listFiles();
        if (files == null) {
            System.err.println("Source directory not found or not readable: " + SOURCE_PATH);
            return;
        }

        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)));
            if (update) {
                indexWriter.deleteAll();
            }
            Random random = new Random();
            for (File file : files) {
                if (!file.isFile()) {
                    // Sub-directories cannot be read with FileReader; skip them
                    // rather than aborting the whole indexing run.
                    continue;
                }
                int score = random.nextInt(100);
                Document document = new Document();
                document.add(new Field("content", new FileReader(file)));
                document.add(new Field("fileName", file.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new Field("path", file.getAbsolutePath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new NumericField("date", Field.Store.YES, true).setLongValue(file.lastModified()));
                document.add(new NumericField("size", Field.Store.YES, true).setIntValue((int) (file.length())));
                document.add(new NumericField("score", Field.Store.YES, true).setIntValue(score));
                indexWriter.addDocument(document);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Searches the {@code content} field for the term "java", re-scores the hits
     * by file name and prints the top 100 results as
     * {@code docId-->score-->fileName-->score field-->size}.
     * I/O errors are printed, not propagated.
     */
    public void searchByFileScoreQuery() {
        IndexSearcher indexSearcher = null;
        try {
            indexSearcher = new IndexSearcher(getReader());
            Query query = new TermQuery(new Term("content", "java"));
            // Wrap the original query so that its scores are adjusted by file name.
            FileNameScoreQuery fileNameScoreQuery = new FileNameScoreQuery(query);
            TopDocs topDocs = indexSearcher.search(fileNameScoreQuery, 100);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                System.out.println(scoreDoc.doc + "-->" + scoreDoc.score + "-->" + document.get("fileName") + "-->" + document.get("score") + "-->" + document.get("size"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the searcher even when the search fails.
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Query wrapper that delegates score adjustment to {@link FileNameScoreProvider}. */
    private static class FileNameScoreQuery extends CustomScoreQuery {
        public FileNameScoreQuery(Query subQuery) {
            super(subQuery);
        }

        @Override
        protected CustomScoreProvider getCustomScoreProvider(IndexReader reader) throws IOException {
            // Lucene calls this once per index segment. The provider must be built
            // on the segment reader passed in, because the doc IDs later handed to
            // customScore() are relative to that segment, not to the whole index.
            return new FileNameScoreProvider(reader);
        }
    }

    /** Adjusts a hit's score depending on the value of its {@code fileName} field. */
    private static class FileNameScoreProvider extends CustomScoreProvider {
        /** File names of the reader's documents, indexed by doc ID. */
        private final String[] fileNames;

        /**
         * @param reader the (segment) reader whose documents will be scored
         * @throws IOException if the field cache cannot be loaded
         */
        public FileNameScoreProvider(IndexReader reader) throws IOException {
            super(reader);
            // The field cache keeps the values in memory for as long as the reader
            // stays open: fast to query, at the cost of memory.
            fileNames = FieldCache.DEFAULT.getStrings(reader, "fileName");
        }

        /**
         * @param doc           doc ID, relative to the reader given to the constructor
         * @param subQueryScore score computed by the wrapped query
         * @param valSrcScore   score of the value source (unused)
         * @return {@code subQueryScore * 10} if the file name contains "JRE" or
         *         "SYSTEM", otherwise {@code subQueryScore / 10}
         */
        @Override
        public float customScore(int doc, float subQueryScore, float valSrcScore) throws IOException {
            // A document without a fileName field has a null cache entry.
            String fileName = fileNames[doc];
            if (fileName != null && (fileName.contains("JRE") || fileName.contains("SYSTEM"))) {
                return 10 * subQueryScore;
            }
            return subQueryScore / 10;
        }
    }

    /** Rebuilds the index from scratch and runs the file-name scored search. */
    public static void main(String[] args) {
        MyScoreQuery myScoreQuery = new MyScoreQuery();
        myScoreQuery.index(true);
        myScoreQuery.searchByFileScoreQuery();
    }
}

Posted by darkshine on Thu, 12 Dec 2019 08:55:10 -0800

Programmer Group

Lucene note25 - use of Lucene - scoring settings by domain

I. demand

II. Specific implementation

Hot Keywords