Data structure string (explain KMP algorithm in detail)

Keywords: Windows less P4 Asterisk

1. Definition of string:
A finite sequence of zero or more characters, also known as a string
A string that contains no characters is called the empty string
A string with only spaces is called a space string
2. Abstract data type of string:

ADT String ( string)
Data
  //The element of a string consists of only one character, and the adjacent elements have the relationship of precursor and successor
  Operation
  StrAssign(T,*chars)//Generate a string T whose value is equal to the string constant chars
  StrCopy(T,S)//If string S exists, copy string S into string T
  ClearString(S)//Empty string if string S exists
  StringEmpty(S)//If string S is empty, return true, otherwise return false
  StrLength(S)//Return the length of the string
  StrCompare(S,T)//If S > T, return a value > 0; if S == T, return 0; otherwise return a value < 0
  Concat(T,S1,S2)//Use T to return a new string connected by S1 and S2
  SubString(Sub,S,pos,len)//Use Sub to return the substring of S that starts at the pos-th character and has length len
  Index(S,T,pos)//String S and T exist, and T is a non empty string. If there is a string with the same value as string T in main string S, it will return its first position in main string, otherwise it will return 0
  Replace(S,T,V)//If strings S, T and V all exist and T is a non-empty string, replace every non-overlapping substring of S that is equal to T with V
  StrInsert(S,pos,T)//If string S and T exist, insert string T before the pos character of string S
  StrDelete(S,pos,len)//If S exists, delete the substring of length len that starts at the pos-th character of S
  endADT                                                             

3. The specific implementation code of the sequential storage structure of the string:
Don't look, it's over a hundred lines. I've been writing for a day

#include<stdio.h>
#include<bits/stdc++.h>
#include<windows.h>
// Largest number of characters a string may hold.
#define max 100
// Length-prefixed string buffer: slot 0 stores the length and slots 1..max
// store the characters, hence max+1 elements.
typedef char string[max+1];
// Build the length-prefixed string a from the NUL-terminated C string chars.
// Returns 1 on success; prints a message and returns 0 if chars has more
// than max characters.
int strassign(string a,char *chars)
{
	size_t n=strlen(chars);
	if(n>max)
	{
		printf("The subscript of the string is out of range!\n");
		return 0;
	}
	// Slot 0 carries the length; the characters follow from slot 1.
	a[0]=(char)n;
	int count=a[0];
	int k=0;
	while(k<count)
	{
		a[k+1]=chars[k];
		k++;
	}
	return 1;
}
// Copy the length-prefixed string t into s (length byte first, then the
// characters in slots 1..length). Always returns 0.
int strcopy(string t,string s)
{
	s[0]=t[0];
	int count=s[0];
	int k=1;
	while(k<=count)
	{
		s[k]=t[k];
		k++;
	}
	return 0;
}
// Reset s to the empty string and zero its whole buffer.
// Always returns 1.
int clearstring(string s)
{
	// An array parameter is really a pointer, so sizeof(s) would only cover a
	// pointer's worth of bytes; sizeof(string) is the size of the full buffer.
	// Zeroing the buffer also sets the length byte s[0] to 0.
	memset(s,0,sizeof(string));
	return 1;
}
// Report whether s is empty (length byte is 0).
// Prints "empty" and returns 1 when it is, otherwise prints "Not empty"
// and returns 0.
int stringempty(string s)
{
	int is_empty=(s[0]==0);
	if(!is_empty)
	{
		printf("Not empty\n");
		return 0;
	}
	printf("empty\n");
	return 1;
}
// Return the length of s, which is stored in slot 0.
int stringlen(string s)
{
	int length=s[0];
	return length;
}
// Compare two length-prefixed strings character by character.
// Returns 1 if s is greater than t, -1 if s is smaller than t and 0 if
// they are equal. When one string is a prefix of the other, the shorter
// one is the smaller; two empty strings compare equal.
int stringcompare(string s,string t)
{
	int slen=s[0];
	int tlen=t[0];
	int common=slen<tlen?slen:tlen;

	// Only the positions that exist in both strings may be compared.
	for(int i=1;i<=common;i++)
	{
		if(s[i]>t[i])
		{
			return 1;
		}
		if(s[i]<t[i])
		{
			return -1;
		}
	}

	// All shared positions are equal, so the lengths decide.
	if(slen>tlen)
	{
		return 1;
	}
	if(slen<tlen)
	{
		return -1;
	}
	return 0;
}
// Store in s the concatenation of t1 followed by t2.
// Returns 1 on success, or 0 (leaving s untouched) if the result would
// need more than max characters.
int concat(string s,string t1,string t2)
{
	// The strings are length-prefixed, not NUL-terminated, so the lengths
	// come from slot 0. Read both before s is written in case s is one of
	// the inputs.
	int len1=t1[0];
	int len2=t2[0];
	if(len1+len2>max)
	{
		return 0;
	}
	// Place the second part first, back to front, so that the copy is still
	// correct when s and t2 are the same buffer.
	for(int i=len2;i>=1;i--)
	{
		s[len1+i]=t2[i];
	}
	for(int i=1;i<=len1;i++)
	{
		s[i]=t1[i];
	}
	s[0]=len1+len2;
	return 1;
}
// Copy into sub the len characters of s that start at position pos
// (positions are 1-based).
// Returns 1 on success. If pos < 1, len < 0, or the requested range runs
// past the end of s, sub is set to the empty string and 0 is returned.
int substring(string sub,string s,int pos,int len)
{
	int slen=s[0];
	// slen-pos+1 is the number of characters available from pos onwards.
	if(pos<1||len<0||len>slen-pos+1)
	{
		sub[0]=0;
		return 0;
	}
	for(int k=1;k<=len;k++)
	{
		sub[k]=s[pos+k-1];
	}
	sub[0]=len;
	return 1;
}
// Print the length and the characters of s, or a notice if s is empty.
void trav(string s)
{
	int count=s[0];
	if(count==0)
	{
		printf("This string is empty.\n");
		return ;
	}
	printf("The length of this string is:%d\n",count);
	printf("This string is:");
	// Characters live in slots 1..count.
	int k=1;
	while(k<=count)
	{
		printf("%c",s[k]);
		k++;
	}
	printf("\n");
}
// Naive pattern matching: find the first occurrence of t in s at or after
// position pos (positions are 1-based).
// Returns the starting position of the match, or 0 if there is none, if
// pos < 1, or if t is empty or longer than s.
int index(string s,string t,int pos)
{
	int m=s[0];
	int n=t[0];
	if(pos<1||n<1||n>m)
	{
		return 0;
	}
	// m-n+1 is the last start position at which t still fits inside s.
	for(int start=pos;start<=m-n+1;start++)
	{
		int k=1;
		while(k<=n&&s[start+k-1]==t[k])
		{
			k++;
		}
		if(k>n)
		{
			return start;
		}
	}
	return 0;
}
// Insert T into S so that T's first character lands at position pos
// (positions are 1-based; pos == length+1 appends).
// Returns 0 on success. Returns -1 and leaves S unchanged if pos is out of
// range or the result would need more than max characters.
int StrInsert(string S, int pos, string T)
{
	int slen = S[0];
	int tlen = T[0];
	if (pos < 1 || pos > slen + 1 || slen + tlen > max)
	{
		return -1;
	}
	// Move the tail right, starting from the end so nothing is overwritten
	// before it has been moved.
	for (int i = slen; i >= pos; i--)
	{
		S[i + tlen] = S[i];
	}
	for (int i = 0; i < tlen; i++)
	{
		S[pos + i] = T[i + 1];
	}
	S[0] = slen + tlen;
	return 0;
}
// Delete len characters from s starting at position pos (1-based).
// Returns 1 on success. Returns 0 and leaves s unchanged if pos < 1,
// len < 0, or the range runs past the end of s.
int stringdelete(string s,int pos,int len)
{
	int total=s[0];
	if(pos<1||len<0||len>total-pos+1)
	{
		return 0;
	}
	// Only the characters after the deleted range are moved, so s[i+len]
	// never goes beyond the original last character.
	for(int i=pos;i<=total-len;i++)
	{
		s[i]=s[i+len];
	}
	s[0]=total-len;
	return 1;
}
// Replace every non-overlapping occurrence of t in s with v, scanning from
// left to right.
// Returns 1 when the scan completes. Returns 0 if t is empty, or if a
// replacement would make s longer than max characters; in the latter case
// the replacements made before that point are kept.
int stringreplace(string s,string t,string v)
{
	int tlen=t[0];
	int vlen=v[0];
	if(tlen<1)
	{
		return 0;
	}
	for(int i=1;i<=s[0]-tlen+1;i++)
	{
		string sub;
		substring(sub,s,i,tlen);
		if(stringcompare(sub,t)!=0)
		{
			continue;
		}
		if(s[0]-tlen+vlen>max)
		{
			return 0;
		}
		stringdelete(s,i,tlen);
		StrInsert(s,i,v);
		// Resume right after the inserted text (the loop's i++ adds the
		// final step), so the characters of v are never re-examined.
		i=i+vlen-1;
	}
	return 1;
}
// Read three words (main string, pattern, replacement), replace every
// occurrence of the pattern in the main string and print the result.
// Returns 1 if three words could not be read.
int main()
{
   char arr1[20];
   char crr1[20];
   char brr1[20];
   // %19s limits each word to what fits in a 20-byte buffer with its NUL.
   if(scanf("%19s%19s%19s",arr1,brr1,crr1)!=3)
   {
      return 1;
   }
   string arr;
   string crr;
   string brr;
   strassign(arr,arr1);
   strassign(brr,brr1);
   strassign(crr,crr1);
    stringreplace(arr,brr,crr);
    trav(arr);
   return 0;
}

4. Chain storage structure of string:
It is similar to the linked storage structure of a linear list; in general, however, it is not as flexible as the sequential storage structure.
5.KMP pattern matching algorithm:
(1) First, let's look at the simple matching method:

It can be seen that the time complexity is O(m*n), which is very inefficient.
(2) KMP pattern matching algorithm (this thing makes my brain ache when I just read):
Purpose: the KMP algorithm is used to find out whether (and where) the pattern string occurs in the main string
Its advantage: a lower time complexity than the naive method
Let's start with an example:

In the naive algorithm, when the j-th character of P mismatches, the pattern P is by default shifted just one position to the right.
However, from the comparison just made we already know that the first j-1 characters of P matched the corresponding j-1 characters of S. This means that one round of trial matching has already told us part of the contents of the main string. With this information we can shift P by several positions at once (I think this is the most fundamental idea of the KMP algorithm)
(3) Next, we introduce the maximum common length of Prefix suffix:
Prefix: a continuous substring containing the first character but not the last
Suffix: a continuous substring containing the last character but not the first
For string abcab
Prefix
1)a
2)ab
3)abc
4)abca
Suffix
1)b
2)ab
3)cab
4)bcab
For this example, the longest common prefix and suffix is ab, whose length is 2
(4)next array:
**next[j] is the length of the longest common prefix and suffix of the string formed by the first j-1 characters (the characters before the j-th one), plus 1.** The next array therefore has as many elements as the pattern string has characters, and by definition next[1]=0.
If the longest common prefix and suffix has length 0, then the array element is 1
Then we write the next array of a string:

(5)
Specific implementation code:

#include<stdio.h>
#include<string.h>
#define max 100
// Slot 0 holds the length and slots 1..max hold the characters, so the
// buffer needs max+1 elements.
typedef char string[max+1];
// Build the length-prefixed string a from the NUL-terminated C string chars.
// Returns 1 on success; prints a message and returns 0 if chars does not
// fit in a string buffer.
int strassign(string a,char *chars)
{
	size_t len=strlen(chars);
	// Slot 0 holds the length, so only sizeof(string)-1 slots are left for
	// characters.
	if(len>sizeof(string)-1)
	{
		printf("The subscript of the string is out of range!\n");
		return 0;
	}
	a[0]=(char)len;
	for(size_t i=0;i<len;i++)
	{
		a[i+1]=chars[i];
	}
	return 1;
}
// Fill next[1..T[0]] with the KMP fall-back positions for pattern T
// (1-based, next[1] is 0).
void get_next(string T,int *next)
{
	int pos=1;
	int k=0;
	next[1]=0;
	while(pos<T[0])
	{
		if(k!=0&&T[pos]!=T[k])
		{
			// Mismatch: fall back to a shorter candidate prefix.
			k=next[k];
			continue;
		}
		// Either k is 0 or the characters agree: advance both positions
		// and record k for the new pos.
		pos++;
		k++;
		next[pos]=k;
	}
}
// KMP pattern matching: find the first occurrence of t in s at or after
// position pos (positions are 1-based).
// Returns the starting position of the match, or 0 if there is none or if
// pos < 1.
int index_kmp(string s,string t,int pos)
{
	if(pos<1)
	{
		return 0;
	}
	int i=pos;
	int j=1;
	// get_next writes entries 1..t[0], so the table is sized for the
	// longest pattern (max characters) rather than a fixed 20.
	int next[max+1];
	get_next(t,next);
	while(i<=s[0]&&j<=t[0])
	{
		if(j==0||s[i]==t[j])
		{
			i++;
			j++;
		}
		else
		{
			// i never moves backwards; only the pattern position falls back.
			j=next[j];
		}
	}
	// j ran past the last pattern character, so the whole pattern matched
	// and the match ends just before i.
	if(j>t[0])
	{
		return i-t[0];
	}
	return 0;
}
// Read a main string and a pattern, then print the position of the
// pattern in the main string as reported by index_kmp.
// Returns 1 if either word could not be read.
int main()
{
	char arr1[20];
	char brr1[20];
	printf("Please enter the main string:\n");
	// %19s limits each word to what fits in a 20-byte buffer with its NUL.
	if(scanf("%19s",arr1)!=1)
	{
		return 1;
	}
	printf("Please enter matching string:\n");
	if(scanf("%19s",brr1)!=1)
	{
		return 1;
	}
	string brr;
	string arr;
	strassign(arr,arr1);
	strassign(brr,brr1);
	printf("The position of this match string in the main string:\n");
    printf("%d",index_kmp(arr,brr,1));
	return 0;
}

(6) First, how to implement the next array in Code:
Given the values of next for the first k elements, find next[k+1]:
1) Suppose next[k+1] is required, where k+1=17

2) If next[16]=8 is known, the element has the following relationship:

3) If P8=P16, it is obvious that next [17] = 8 + 1 = 9 (the maximum value of next[k+1])
4) If it is not equal, and if next[8]=4, then it has the following relationship

Mainly to prove:

5) Now we check whether P16=P4: if so, next[17]=4+1=5; otherwise we continue the recurrence (this is the meaning of j=next[j] in get_next)
6) If next[4]=2, there is the following relationship

7) If P16=P2, then next[17]=2+1=3; otherwise, continue to take next[2]=1, next[1]=0; if there is no result at 0, then the recurrence ends, at this time, next[17]=1;
8) How to make the program find and return the corresponding subscript
It should also be noted that the value of i never becomes smaller (that is, the main-string pointer never goes back); only the value of j keeps changing. A successful match is detected by **j > t[0]** rather than by j == t[0], because when the last character of the pattern string also matches, the if branch is entered and j is incremented once more; so when j equals t[0]+1, a matching substring has been found.
Improving the KMP algorithm: it is still not as efficient as it could be, so how can we improve it?
For example, main string: aaaabcde
Pattern string: aaaaax
In the first round of matching, the fifth a of the pattern string does not match the b of the main string, so the next array is used to fall back and match again. The result is the same as with the naive algorithm, as shown in the figure:
In fact, since the fifth a of the pattern string does not match b, the a's before it cannot match b either and do not need to be compared, so this part of the algorithm is inefficient.
First code:

 void get_nextval(string T,int *nextval)
{
	// Fill nextval[1..T[0]] with the improved KMP fall-back positions for
	// pattern T (1-based, nextval[1] is 0).
	int pos=1;
	int k=0;
	nextval[1]=0;
	while(pos<T[0])
	{
		if(k!=0&&T[pos]!=T[k])
		{
			// Mismatch: fall back to a shorter candidate prefix.
			k=nextval[k];
			continue;
		}
		pos++;
		k++;
		// If the character at the fall-back position k equals the one at
		// pos, reuse k's own fall-back value; otherwise k is the answer.
		nextval[pos]=(T[pos]!=T[k])?k:nextval[k];
	}
}

The line marked with asterisks is the improvement. Using the example above (main string and pattern string from the start): when the b of the main string does not match the fifth a of the pattern string, the pattern position would normally fall back to the next value of the fifth a. But if the character at that fall-back position is equal to the fifth a, there is no need to compare it with b, because it is bound to mismatch as well. So what should be done in that case? The nextval of the fifth a is set to the nextval of the fall-back position instead, which skips the useless comparison. It is a bit of a detour...
Then take this picture to explain:

At this time, suppose matching fails at position 16 (the next value of the 16th element is 8, and the nextval of the 8th element is 4). Compare the element at the position given by the next value of the 16th element (position 8, as can be seen from the figure) with the 16th element. If they are not equal, the nextval of the 16th element is the same as its next value. Otherwise, the nextval of the 16th element is set to the nextval of the 8th element. Here you may wonder: how can we be sure that the 4th element (the position that the nextval of the 8th element points to) is not equal to the 16th?
This is where the recursive idea comes in: the values of nextval are computed from front to back. Since the nextval of position 8 is 4, the element at position 8 is not equal to the element at position 4. Going back to positions 8 and 16: since the elements at positions 8 and 16 are equal, the element at position 16 is not equal to the element at position 4 either...
When it comes to this, I try my best. It really makes my skull ache....

Published 22 original articles, won praise 12, visited 762
Private letter follow

Posted by seavolvox on Sat, 22 Feb 2020 04:56:51 -0800