Reposted from: https://www.cnblogs.com/AlanLee/p/5329555.html
Keyword to search for the underlying principle: DFA algorithm
This is basically a direct port of the original Java code. It should be possible to use Dictionary<string, int> instead of Hashtable, but the material I found says that Hashtable is extremely fast; although I know that claim was made about the Java environment, I was too lazy to change it. This can be implemented without thread blocking.
In a quick test, initializing from a text file of more than 19,000 lines takes about 40 milliseconds, and searching for more than 100 keywords in a text of about 20,000 words takes only 7 milliseconds. (Preparing the test text for the random-insertion test took some effort: the first version of the random insertion would occasionally insert a keyword into the middle of a previously inserted keyword, so the number of matched words was always wrong.)
/// <summary>
/// Forbidden-word filter based on the DFA algorithm. The word list is compiled
/// into a trie of nested <see cref="Hashtable"/> nodes: each node maps a
/// one-character string to its child node and stores an "IsEnd" flag (0 or 1)
/// telling whether a complete word ends at that node.
/// </summary>
public class ForbiddentWordLibrary
{
    /// <summary>
    /// Key under which every non-root node stores its end-of-word flag.
    /// Character keys are always one-character strings, so they can never
    /// collide with this five-character key.
    /// </summary>
    private const string EndKey = "IsEnd";

    /// <summary>
    /// Builds the filter from a UTF-8 text file containing one forbidden word per line.
    /// Each line is trimmed; blank lines are ignored.
    /// </summary>
    /// <param name="path">Path of the word-list file.</param>
    public ForbiddentWordLibrary(string path)
    {
        // No catch-and-rethrow wrapper: exceptions from StreamReader propagate
        // unchanged, keeping their original stack trace.
        words = new HashSet<string>();
        using (var reader = new StreamReader(path, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                string word = line.Trim();
                if (word.Length > 0)
                {
                    words.Add(word);
                }
            }
        }

        InitLibrary();
    }

    /// <summary>
    /// Finds all forbidden words in the input string, scanning left to right.
    /// At each position the longest forbidden word starting there is reported,
    /// and scanning resumes right after it.
    /// </summary>
    /// <param name="input">Text to scan; null or empty yields an empty list.</param>
    /// <returns>The matched words in order of appearance (duplicates included).</returns>
    public List<string> GetAllForbiddenWords(string input)
    {
        List<string> result = new List<string>();
        if (string.IsNullOrEmpty(input))
        {
            return result;
        }

        for (int i = 0; i < input.Length; i++)
        {
            int length = SearchFW(input, i);
            if (length > 0)
            {
                result.Add(input.Substring(i, length));
                // Skip the matched word; the loop's i++ moves past its last character.
                i += length - 1;
            }
        }

        return result;
    }

    /// <summary>
    /// Walks the trie from <paramref name="beginIndex"/> and returns the length of the
    /// longest forbidden word that starts exactly at that position.
    /// </summary>
    /// <param name="input">Input string.</param>
    /// <param name="beginIndex">Index at which the match must start.</param>
    /// <returns>Length of the longest matching word, or 0 if none starts here.</returns>
    private int SearchFW(string input, int beginIndex)
    {
        int matchedLength = 0;
        Hashtable node = lib;
        for (int i = beginIndex; i < input.Length; i++)
        {
            var next = node[input[i].ToString()] as Hashtable;
            if (next == null)
            {
                break;
            }

            node = next;
            // Record the length only when a complete word ends here, so that
            // characters walked past the last complete word (a mere prefix of a
            // longer word) are not counted as part of the match.
            if ((int)node[EndKey] == 1)
            {
                matchedLength = i - beginIndex + 1;
            }
        }

        return matchedLength;
    }

    /// <summary>
    /// Builds the trie in <see cref="lib"/> from the words in <see cref="words"/>.
    /// </summary>
    private void InitLibrary()
    {
        lib = new Hashtable(words.Count);
        foreach (string word in words)
        {
            Hashtable node = lib;
            for (int i = 0; i < word.Length; i++)
            {
                string key = word[i].ToString();
                var child = node[key] as Hashtable;
                if (child == null)
                {
                    child = new Hashtable();
                    child.Add(EndKey, 0);
                    node.Add(key, child);
                }

                node = child;
            }

            // Mark the node reached by the last character as a word end.
            // The root itself never carries a flag, hence the length guard.
            if (word.Length > 0)
            {
                node[EndKey] = 1;
            }
        }
    }

    /// <summary>
    /// Original forbidden-word data set as read from the file.
    /// </summary>
    private readonly HashSet<string> words;

    /// <summary>
    /// Root of the forbidden-word trie.
    /// </summary>
    private Hashtable lib;
}