DFA algorithm: a C# implementation

Keywords: C# Java encoding

Reposted from: https://www.cnblogs.com/AlanLee/p/5329555.html

For the underlying principle, search for the keyword: DFA algorithm

 

This is basically a direct port of the original Java code. It should be possible to use Dictionary<string, int> instead of Hashtable, but the material I found says that Hashtable is extremely fast. Although I know that claim was made about the Java environment, I was too lazy to change it. This can be implemented without thread blocking.

I tried it out. Initializing from a text file of more than 19,000 lines takes about 40 milliseconds, and searching a text of about 20,000 characters for more than 100 keywords takes only 7 milliseconds. (The keywords were inserted at random positions for the test, and preparing the test text took some effort: the first version of the random insertion would occasionally insert a keyword into the middle of a previously inserted keyword, so the number of matched words never came out right.)

 

  1     /// <summary>
  2     /// Filter words DFA Algorithm implementation
  3     /// </summary>
  4     public class ForbiddentWordLibrary
  5     {
  6         /// <summary>
  7         /// Initializing the filter thesaurus with a branch filter word file
  8         /// </summary>
  9         /// <param name="path">File path</param>
 10         public ForbiddentWordLibrary( string path )
 11         {
 12             try
 13             {
 14                 words = new HashSet<string>();
 15                 using( var stream = new StreamReader( path, Encoding.UTF8 ) )
 16                 {
 17                     while( !stream.EndOfStream )
 18                     {
 19                         words.Add( stream.ReadLine().Trim() );
 20                     }
 21                 }
 22                 InitLibrary();
 23             }
 24             catch( Exception ex )
 25             {
 26                 throw ex;
 27             }
 28         }
 29 
 30         /// <summary>
 31         /// Find all sensitive words in the input string
 32         /// </summary>
 33         /// <param name="input"></param>
 34         /// <returns></returns>
 35         public List<string> GetAllForbiddenWords( string input )
 36         {
 37             List<string> result = new List<string>();
 38             for( int i = 0; i < input.Length; i++ )
 39             {
 40                 int length = SearchFW( input, i );
 41                 if( length > 0 )
 42                 {
 43                     result.Add( input.Substring( i, length ) );
 44                     i = i + length - 1;
 45                 }
 46             }
 47 
 48             return result;
 49         }
 50 
 51         /// <summary>
 52         /// Search the input string, find all sensitive words, and return the length of sensitive words if found
 53         /// </summary>
 54         /// <param name="input">Input string</param>
 55         /// <param name="beginIndex">Start of search</param>
 56         /// <returns></returns>
 57         private int SearchFW( string input, int beginIndex )
 58         {
 59             bool flag = false;
 60             int len = 0;
 61             Hashtable ht = lib;
 62             for( int i = beginIndex; i < input.Length; i++ )
 63             {
 64                 var c = input[ i ];
 65                 var obj = ht[ c.ToString() ];
 66                 if( obj == null )
 67                     break;
 68                 else
 69                 {
 70                     len++;
 71                     ht = (Hashtable)obj;
 72                     if( (int)ht[ "IsEnd" ] == 1 )
 73                         flag = true;
 74                 }
 75             }
 76 
 77             if( !flag )
 78                 len = 0;
 79 
 80             return len;
 81         }
 82 
 83         /// <summary>
 84         /// Initialize thesaurus structure
 85         /// </summary>
 86         private void InitLibrary()
 87         {
 88             lib = new Hashtable( words.Count );
 89             var tmp = lib;
 90             foreach( string k in words )
 91             {
 92                 for( int i = 0; i < k.Length; i++ )
 93                 {
 94                     var c = k[ i ].ToString();
 95                     if( tmp.ContainsKey( c ) )
 96                     {
 97                         tmp = (Hashtable)tmp[ c ];
 98                     }
 99                     else
100                     {
101                         var nht = new Hashtable();
102                         nht.Add( "IsEnd", 0 );
103                         tmp.Add( c, nht );
104                         tmp = nht;
105                     }
106 
107                     if( i == k.Length - 1 )
108                     {
109                         if( tmp.ContainsKey( "IsEnd" ) )
110                             tmp[ "IsEnd" ] = 1;
111                         else
112                             tmp.Add( "IsEnd", 1 );
113                     }
114                 }
115                 tmp = lib;
116             }
117         }
118 
119         /// <summary>
120         /// Original filter word data set
121         /// </summary>
122         private HashSet<string> words;
123         /// <summary>
124         /// Filter Thesaurus
125         /// </summary>
126         private Hashtable lib;
127     }    

Posted by PHP'er on Wed, 29 Apr 2020 10:24:37 -0700