Description: an English text search engine that currently supports Boolean queries, phrase queries, and free-text queries.
The C implementation uses only a few common data structures, such as binary trees and hash tables. The document below is a course report rather than a tutorial, so it may not always be clear; it is best used as a reference.
Author: a novice programmer from the School of Software, Huazhong University of Science and Technology (HUST)
Reference: Introduction to Information Retrieval (Chinese edition, Posts & Telecom Press)
English text search engine development document
Document material
Ten txt files in the static directory, from 0.txt to 9.txt, are from BBC English reports.
Program flow
- Build the inverted index: a hash table stores each term together with its postings list (inverted table), document frequency, term frequency, and related information
- Provide retrieval services
- Boolean query
- Phrase query: build the inverted index with positional information
- Free-text query: rank the retrieval results using the vector space model
Build inverted index - hash table
1. Read the terms of a document one by one. If a term already exists in the inverted index, only update its information; otherwise insert it and initialize its information.
2. Repeat step 1 until all documents have been traversed and the inverted index is complete.
/* * Build inverted index * * return: Built inverted index (hash table) * */ HashTable BuildIndex() { int length, wordCounts = 0, TableSize = 3000, wordOrder = 1; char *word = NULL; FILE *fp; char fileID[10], buffer[100]; Position p; HashTable HT = InitHashTable(TableSize); //Initializing a hash table for (int i = 0; i < IDLength; i++) { //String of the file path at the splice, such as ".. / static/0.txt" char txt[50] = "../static/"; itoa(fileIDs[i], fileID, 10); strcat(txt, fileID); strcat(txt, ".txt"); wordOrder = 1; if ((fp = fopen(txt, "r")) == NULL) { //Failed to open file successfully printf("Open failed: %s\n", txt); exit(0); } while (!feof(fp)) { length = ReadAWord(fp, &word); //Read a word from text if (length >= 3) { //If the read string is longer than or equal to 3, we treat it as a word wordCounts++; //Words read plus 1 InsertOrCount(HT, word, fileIDs[i], wordOrder++); //If the word appears for the first time, insert it into the table; if it already exists in the table, add one word frequency. 
wordOrder is used to record where the entry appears } else { //Otherwise, abandon it and release space free(word); } } fclose(fp); } return HT; } /* * Insert the word item into the hash table or increase the corresponding record * * HT: Inserted hash table * E: Inserted term * docID: Document ID * */ void InsertOrCount(HashTable HT, ElementType E, int docID, int termPosition) { Position p = Find(HT, E); //First, check whether E exists in table HT if (HT->Cells[p].info != Legitimate) { //If the location is not occupied, insert HT->Cells[p].info = Legitimate; HT->Cells[p].wordTerm = E; HT->Cells[p].docFrequency = 1; HT->Cells[p].table = (InvertedTable) malloc(sizeof(struct InvertedTable)); HT->Cells[p].table->docID = docID; //Record docID HT->Cells[p].table->frequency = 1; //Record the frequency of occurrences in this document HT->Cells[p].table->positions = (TermPositions) malloc(sizeof(struct TermPosition)); HT->Cells[p].table->positions->p = termPosition; HT->Cells[p].table->positions->next = NULL; HT->Cells[p].table->next = NULL; //Tag next is NULL } else { //This word item already exists in the dictionary. Judge whether there is a record of this document in the corresponding inverted list InvertedTable move = HT->Cells[p].table; //For traversing linked list int order = 0; //Used to record node location while (move != NULL && docID > move->docID) { //If the corresponding document ID is not found and the backward table traversal is not finished, the loop continues move = move->next; order++; } if (move != NULL && move->docID == docID) { //There are records of this document in the corresponding inverted table move->frequency++; //Word frequency plus one corresponding to inverted list node //Insert the location information, because the location is getting larger and larger, so just insert it directly to the end. 
And because of the existing records of this document, all cases in which the management link list is empty are not needed TermPositions positionMove = move->positions; while (positionMove->next != NULL) { positionMove = positionMove->next; } positionMove->next = (TermPosition) malloc(sizeof(struct TermPosition)); positionMove->next->p = termPosition; positionMove->next->next = NULL; } else { //There is no record of this document in the corresponding inverted table InvertedTable newNode = (InvertedTable) malloc(sizeof(struct InvertedTable)); //New inverted table node newNode->docID = docID; //Record document ID newNode->frequency = 1; //Initialize word frequency newNode->positions = NULL; AddNodeToTermP(&newNode->positions, termPosition); InsertNode(HT->Cells[p].table, newNode, order); HT->Cells[p].docFrequency++; //Document frequency of word item plus 1 } } }
Inverted index data structure
-
List of word position
/*
 * Node of a singly linked list of positions.
 *
 * Both TermPosition (one node) and TermPositions (a whole list, referred to
 * by its first node) are pointer types to the same structure.
 */
struct TermPosition {
    int p;                      /* stored position value */
    struct TermPosition *next;  /* following node, NULL at the end */
};

typedef struct TermPosition *TermPosition;
typedef struct TermPosition *TermPositions;
-
Inverted record table: single chain table
/*
 * Inverted record table (postings list): a singly linked list with one node
 * per document that contains the term.
 *
 * The `positions` member was missing from this listing although the index
 * building and phrase-query code read and write `->positions`; it is
 * declared through the struct tag so this definition does not depend on the
 * TermPosition typedefs being visible first.
 */
typedef struct InvertedTable {
    int docID;                          /* document ID */
    int frequency;                      /* occurrences of the term in this document */
    struct TermPosition *positions;     /* list of positions inside the document */
    struct InvertedTable *next;         /* next postings node, NULL at the end */
} *InvertedTable;
-
Inverted index: hash table
typedef unsigned int Position; typedef char *ElementType; //Used to mark the status of a node typedef enum Info { Legitimate, Empty, Deleted } Info; //Node of hash table, inverted index corresponding to a word item struct TblNode { ElementType wordTerm; //Lexical entry int docFrequency; //Document frequency InvertedTable table; //Inverted record of a word Info info; //state }; typedef struct TblNode *Cell; //Hashtable typedef struct HashTable { int TableSize; //Table size Cell Cells; //Node array } *HashTable;
Boolean query
And operation: merge inverted table
/*
 * Intersection (Boolean AND) of two inverted record tables.
 *
 * t1, t2: postings lists whose docIDs are sorted in ascending order
 *
 * return: list built by calling AddNode for every node of t1 whose docID
 *         also occurs in t2; NULL when there is no common document
 *
 * Because both inputs are sorted, the list with the smaller current docID
 * can simply be advanced: that docID cannot appear later in the other list.
 */
InvertedTable Intersect(InvertedTable t1, InvertedTable t2)
{
    InvertedTable answer = NULL;
    InvertedTable left = t1;
    InvertedTable right = t2;

    while (left != NULL && right != NULL) {
        if (left->docID < right->docID) {
            left = left->next;
        } else if (left->docID > right->docID) {
            right = right->next;
        } else {
            /* Same document on both sides: keep it and move both cursors. */
            AddNode(&answer, left);
            left = left->next;
            right = right->next;
        }
    }

    return answer;
}
OR operation: take the union of two inverted tables; the implementation is similar to the above
NOT operation: take the complement of an inverted table; the implementation is similar to the above
Parse the Boolean query expression and calculate the result:
The key functions used are shown below (a stack is used to match parentheses and the double quotes that delimit phrases):
//Data structure: used to store expressions, in which the inverted record table is the operand typedef struct ExpItem{ int type; //0 for the head of the list, 1 for the operation value, 2 for the operator InvertedTable table; char operator; struct ExpItem *next; } *ExpItem; typedef struct ExpItem *Exp; /* * Analyze infix expression and calculate result * midExp: Infix expression * * return: Calculated inverted index table * */ InvertedTable ComPuteMidExp(char *midExp) { char buffer[100], *sub, ch; //buffer is used to temporarily store entries, and move is used to traverse expressions int chIndex = 0, expIndex = 0, qufanFlag = 0, cizuFlag = 0; InvertedTable expItem; Position p; Stack S = CreateStack(100); //Create a stack //Create expression Exp exp = (Exp) malloc(sizeof(struct ExpItem)); //Use to store expressions that remove parentheses and exp->type = 0; exp->next = NULL; //China & ! (public | ! (country | you)) while (midExp[expIndex] != '\0') { //Read each character of the command in turn and make a judgment ch = midExp[expIndex]; if (!(IsOperator(ch) || ch == '(' || ch == ')' || ch == ' ' || ch == '"') && IsEmpty(S)) { //The character is not an operator and is not in parentheses buffer[chIndex++] = ch; //Buffering } else { if (chIndex != 0 && IsEmpty(S)) { buffer[chIndex] = 0; //Add 0 at the end of the term p = Find(HT, buffer); //Get the position of the inverted index table of terms // printf("item: %s position: %d\n", buffer, p); expItem = HT->Cells[p].table; if(qufanFlag == 1){ expItem = Complement(expItem); qufanFlag = 0; } InsertItem(exp, expItem); //Add operands to expressions chIndex = 0; } if (IsOperator(ch) && IsEmpty(S)) { //If the character is an operation symbol and is not in parentheses if(ch == '!'){ qufanFlag = 1; } else { InsertOperator(exp, ch); //Add operator to expression } } else if (midExp[expIndex] == '(') { Push(S, expIndex); //The position of the left bracket is pushed } else if (midExp[expIndex] == ')') { int left = Pop(S) + 1; //Left 
is the position of the first character to the right of the left bracket, and endIndex is the position of the right bracket if(IsEmpty(S)){ char temExp[100]; strncpy(temExp, midExp + left, expIndex - left); temExp[expIndex - left] = '\0'; expItem = ComPuteMidExp(temExp); if(qufanFlag == 1){ expItem = Complement(expItem); qufanFlag = 0; } InsertItem(exp, expItem); //Add operands to expressions } } else if (midExp[expIndex] == '\"'){ if(cizuFlag == 0){ Push(S, expIndex); //Stack the position of the opening quotation mark cizuFlag = 1; } else { int left = Pop(S) + 1; //left is the position of the first character to the right of the opening quotation mark, and endIndex is the position of the closing quotation mark if (IsEmpty(S)){ char temExp[100]; strncpy(temExp, midExp + left, expIndex - left); temExp[expIndex - left] = '\0'; expItem = ComputeCiZuExp(temExp); if(qufanFlag == 1){ expItem = Complement(expItem); qufanFlag = 0; } InsertItem(exp, expItem); //Add operands to expressions } cizuFlag = 0; } } } expIndex++; } if (chIndex != 0) { //If there is a word at the end of the command buffer[chIndex] = 0; //Add 0 at the end of the term p = Find(HT, buffer); //Get the position of the inverted index table of terms expItem = HT->Cells[p].table; if(qufanFlag == 1){ expItem = Complement(expItem); qufanFlag = 0; } InsertItem(exp, expItem); //Add operands to expressions } return ComPuteSimpleExp(exp); } /* * Computes a simple expression of & and | * */ InvertedTable ComPuteSimpleExp(Exp exp) { char operator; ExpItem expMove = exp->next; InvertedTable item1 = NULL, item2 = NULL; if(expMove->type == 1 && expMove->next == NULL){ return expMove->table; } while (expMove != NULL) { if (expMove->type == 1) { if (item1 == NULL) { //The first term of binary operation item1 = expMove->table; } else { //Second term of binary operation, calculated value item2 = expMove->table; switch (operator) { case '&': item1 = Intersect(item1, item2); break; case '|': item1 = Union(item1, item2); 
break; } } } else if (expMove->type == 2) { operator = expMove->operator; } expMove = expMove->next; } return item1; }
test case
Input string on the left, retrieved document IDs on the right. China: 1 2 6 8 public: 2 5 country: 0 2 4 5 6 8 you: 0 1 2 3 4 8 China | public & country: 2 5 6 8 China & (public | (country | you)): 1 2 6 8
Phrase query
**Principle:** store the positions of each word within a document in the inverted index, and merge the inverted tables using this positional information at query time.
/*
 * Positional intersection of two inverted record tables: find the documents
 * in which an occurrence of the second term directly follows an occurrence
 * of the first term.
 *
 * t1, t2: postings lists of the first and second term, sorted by docID,
 *         each node carrying its position list in ascending order
 *
 * return: newly allocated postings list with one node per matching
 *         document; its `positions` hold the positions of the second term
 *         in each adjacent pair and `frequency` is the number of pairs.
 *         NULL when no document matches.
 *
 * The result is appended through a tail pointer, so building it is linear
 * in the number of matches instead of rescanning the list for every node.
 * If a result node cannot be allocated, the matches found so far are
 * returned.
 */
InvertedTable PositionalIntersect(InvertedTable t1, InvertedTable t2)
{
    InvertedTable answer = NULL;
    InvertedTable tail = NULL;      /* last node of answer */

    while (t1 != NULL && t2 != NULL) {
        if (t1->docID < t2->docID) {
            t1 = t1->next;
            continue;
        }
        if (t1->docID > t2->docID) {
            t2 = t2->next;
            continue;
        }

        /* Same document: merge the two position lists. */
        TermPosition p1 = t1->positions;
        TermPosition p2 = t2->positions;
        TermPosition p0 = NULL;     /* positions of the adjacent pairs */
        int pNum = 0;               /* number of adjacent pairs */

        while (p1 != NULL && p2 != NULL) {
            if (p2->p == p1->p + 1) {
                AddNodeToTermP(&p0, p2->p);
                pNum++;
                p1 = p1->next;
                p2 = p2->next;
            } else if (p2->p <= p1->p) {
                p2 = p2->next;      /* second term is not after the first */
            } else {
                p1 = p1->next;      /* gap too large: advance the first */
            }
        }

        if (p0 != NULL) {
            InvertedTable newNode = malloc(sizeof(struct InvertedTable));

            if (newNode == NULL) {
                break;              /* out of memory: keep earlier matches */
            }
            newNode->docID = t2->docID;
            newNode->positions = p0;
            newNode->frequency = pNum;
            newNode->next = NULL;

            if (tail == NULL) {
                answer = newNode;
            } else {
                tail->next = newNode;
            }
            tail = newNode;
        }

        t1 = t1->next;
        t2 = t2->next;
    }

    return answer;
}
test case
Input string on the left and retrieved document ID on the right "father she had been raped": 5 "Accelerating deforestation and rampant": 3 China & (public | "father she had been raped") | "Accelerating deforestation and rampant": 3
Free text query based on vector space model
Principle:
- Let tf be the word frequency of a word item in a document,
- df is the document frequency of a term in the document collection, and idf is the inverse document frequency, defined as log(N / df), where N is the total number of documents
- The weight w of a word item in a document can be expressed as: tf * idf
- The weight between a document and all terms in the dictionary constitutes a vector, which is called document vector
- The similarity between two documents can be expressed by the cosine of the angle between their document vectors: $\cos\theta = \dfrac{V_1 \cdot V_2}{|V_1|\,|V_2|}$
- If a free text query is regarded as a document, its query vector can be obtained. Then the correlation degree between a query and a document can be expressed by the similarity between the query vector and the document vector
- Calculate the similarity between the query vector and every document vector, and output the IDs of the K documents with the highest similarity (this can be implemented with a max-heap)
code implementation
//data structure //Nodes of query vector typedef struct QueryNode{ char * item; //Lexical entry int frequency; //Frequency of occurrence in free text, i.e. tf value //The idf value is directly used as the data in the inverted index struct QueryNode * next; } * QueryNode; //Query vector typedef struct QueryVector{ QueryNode first; int length; } * QueryVector; //Algorithm implementation /* * A free text query * */ void FreeTextSearch() { char query[100]; //command is used to store commands and buffer is used to temporarily store entries int k = 3; while(1){ printf("please enter the query (enter q to return):\n"); gets(query); if(strcmp(query, "q") == 0){ break; } QueryVector queryVector = GetQueryVector(query); //Calculate query vector OutputTopKDocs(queryVector, k); //Output the top five document ID S with the highest correlation } } /* * Based on the vector space model, the first k document ID S with the highest relevance are output * */ void OutputTopKDocs(QueryVector queryVector, int k) { int *docIDs = (int *) malloc(k * sizeof(int)); float *lengths, *Scores = (float *) malloc(IDLength * sizeof(float)); InvertedTable move; QueryNode queryMove = queryVector->first; Position p; for(int i = 0; i < IDLength; i++){ //Initialization Scores[i] = 0; } lengths = ComputeVectorLength(); //Get the length of all document vectors //Traverse the query vector and accumulate the inner product of the query vector and each document vector for(int i = 0; i < queryVector->length; i++){ p = Find(HT, queryMove->item); float idf = log10f((float)IDLength / (float)HT->Cells[p].docFrequency); //Inverted document frequency, only one value for each term float queryWf = (float)queryMove->frequency * idf; //The weight of the term in the query vector //Traversing the inverted record table of the term move = HT->Cells[p].table; while(move != NULL){ float docWf = (float)move->frequency * idf; //The weight of a word item in a document Scores[move->docID] += queryWf * docWf; move = move->next; } 
queryMove = queryMove->next; } //Divide each inner product by the length of the corresponding document vector for(int i = 0; i < IDLength; i++){ //Initialization Scores[i] = Scores[i] / lengths[i]; } for(int i = 0; i < IDLength; i++){ printf("doc%d's scores: %f\n", i, Scores[i]); } HeapSort(Scores, IDLength, k); //Output k documents with the highest score free(lengths); free(Scores); free(docIDs); } /* * Calculate query vector of query text * query: User entered free text * * return Query vector corresponding to free text * */ QueryVector GetQueryVector(char *query) { char buffer[21], ch; //Buffer, up to 20 words read int queryIndex = 0, bufferIndex = 0; QueryVector queryVector = InitQueryVector(); while (query[queryIndex] != '\0') { ch = query[queryIndex]; if ((ch < 48 || (ch >= 58 && ch <= 64) || (ch >= 91 && ch <= 96 && ch != 95) || ch > 122) && bufferIndex != 0) { //If the character read is a separator and the entry has been read if (bufferIndex >= 3) { // If the length of the entry is greater than or equal to 3, stop reading and insert the information into the query vector. Otherwise, discard the entry buffer[bufferIndex] = '\0'; InsertOrCountToQuery(queryVector, buffer); } bufferIndex = 0; } else { //If the read character is not a separator, it will be stored in the buffer buffer[bufferIndex++] = ch; } queryIndex++; } if (bufferIndex >= 3) { // If there is an entry with length greater than or equal to 3 at the end of the string, insert its information into the query vector buffer[bufferIndex] = '\0'; InsertOrCountToQuery(queryVector, buffer); } return queryVector; } /* * Initialize a query vector * */ QueryVector InitQueryVector() { QueryVector queryVector = (QueryVector) malloc(sizeof(struct QueryVector)); queryVector->first = NULL; queryVector->length = 0; return queryVector; } /* * Insert a word item and its information into the query vector. 
If it already exists, increase the corresponding word frequency * */ void InsertOrCountToQuery(QueryVector queryVector, char *item) { QueryNode move = queryVector->first; int i; for (i = 0; i < queryVector->length; i++) { if (strcmp(move->item, item) == 0) { //If the term already exists in the query vector, increase the word frequency move->frequency++; break; } move = move->next; } if (i == queryVector->length) { //If there is no such term in the query vector, it will be directly inserted into the header //Newly build QueryNode newNode = (QueryNode) malloc(sizeof(struct QueryNode)); newNode->item = (char *) malloc((strlen(item) + 1) * sizeof(char)); strcpy(newNode->item, item); newNode->frequency = 1; //insert newNode->next = queryVector->first; queryVector->first = newNode; queryVector->length++; } } /* * Calculate the length of all document vectors * HT: Inverted index * * return: Length of all document vectors, array * */ float *ComputeVectorLength() { float *lengths = (float *) malloc(IDLength * sizeof(int)); InvertedTable move; for(int i = 0; i < IDLength; i++){ //Initialization lengths[i] = 0; } //Traversing the dictionary and accumulating the length of the document vector for(int i = 0; i < HT->TableSize; i++){ if(HT->Cells[i].info == Legitimate){ //If there is a word item in this position float idf = log10f((float)IDLength / (float)HT->Cells[i].docFrequency); //Reverse document frequency move = HT->Cells[i].table; while(move != NULL){ float wf = (float)move->frequency * idf; //The weight of a word item in a document lengths[move->docID] += wf * wf; move = move->next; } } } for(int i = 0; i < IDLength; i++){ //Square root lengths[i] = sqrtf(lengths[i]); } return lengths; }
test case
A passage copied from document 7.txt is used as the query. //Input: further action will be taken against //Output: doc0's scores: 0.028482 doc1's scores: 0.003112 doc2's scores: 0.035071 doc3's scores: 0.008099 doc4's scores: 0.010380 doc5's scores: 0.000000 doc6's scores: 0.005722 doc7's scores: 0.124365 doc8's scores: 0.016283 doc9's scores: 0.014555 Top K: docId: 7 scores: 0.124365 docId: 2 scores: 0.035071 docId: 0 scores: 0.028482
Some ways to optimize the program
- Preprocessing entries
- Order the Boolean query operations so that the shortest inverted tables are merged first
- Index compression