UTF-8 encoded data can be used directly by the following code.
The most critical step is splitting the string into individual characters. In UTF-8, a single-byte character has its highest bit set to 0. For a multi-byte character, the number of consecutive 1 bits at the top of the first byte gives the number of bytes in the encoding, and every remaining byte starts with the bits 10.
In the original UTF-8 scheme a character can occupy up to six bytes (RFC 3629 later restricted valid UTF-8 to at most four bytes).
1 byte  0xxxxxxx
2 bytes 110xxxxx 10xxxxxx
3 bytes 1110xxxx 10xxxxxx 10xxxxxx
4 bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
The rest is simple.
1. Disabled Word Processing
2. String to be tested
a) Split it into individual characters
b) Convert uppercase letters to lowercase, convert full-width letters and spaces to half-width, and remove excess spaces (at most one space is kept after an English letter; no space is kept after a Chinese character)
c) For each distinct character in the string, check whether any disabled word that begins with that character occurs in the string being tested
#include <stdio.h>
#include <string.h>

#include <iostream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>

// Sensitive ("disabled") word filter for UTF-8 text.
//
// Disabled words and the strings to be tested are both normalized character by
// character (uppercase -> lowercase, full-width letter/space -> half-width)
// before they are compared, so matching ignores case and character width.
class CDisableWord
{
    struct SDisableWord
    {
        std::string str;  // normalized disabled word
    };
    typedef std::vector<SDisableWord> VDW;

private:
    // First (normalized) character of a disabled word -> all words starting with it.
    std::map<std::string, VDW> m_mapDisableWord;
    // Every normalized disabled word added so far; used to reject duplicates.
    std::set<std::string> m_setAllDisableWord;
    // Special conversion: character -> replacement character.
    std::map<std::string, std::string> m_mapSpecialWord;

private:
    // Splits a UTF-8 buffer into single characters appended to `output`.
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output) const;
    // Returns the replacement for a special character, or NULL when there is none.
    const std::string* GetSpecialWord(const std::string& str) const;
    // Splits `str` into characters and applies the special-character conversion.
    std::vector<std::string> NormalizeWords(const std::string& str) const;

public:
    CDisableWord();
    // Registers one disabled word.
    void AddOneDisableWord(const std::string& str);
    // Detection: returns true when the text contains no disabled word.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

// Builds the conversion table: ASCII uppercase, full-width uppercase and
// full-width lowercase letters all map to the ASCII lowercase letter, and the
// ideographic (full-width) space maps to an ASCII space.
CDisableWord::CDisableWord()
{
    for (int i = 0; i < 26; ++i)
    {
        const std::string lower(1, static_cast<char>('a' + i));
        const std::string upper(1, static_cast<char>('A' + i));

        // U+FF21..U+FF3A (full-width A-Z) are encoded as EF BC A1..BA.
        std::string fullUpper("\xEF\xBC");
        fullUpper += static_cast<char>(0xA1 + i);

        // U+FF41..U+FF5A (full-width a-z) are encoded as EF BD 81..9A.
        std::string fullLower("\xEF\xBD");
        fullLower += static_cast<char>(0x81 + i);

        m_mapSpecialWord[fullUpper] = lower;
        m_mapSpecialWord[fullLower] = lower;
        m_mapSpecialWord[upper] = lower;
    }

    // U+3000 (ideographic space) is encoded as E3 80 80.
    m_mapSpecialWord["\xE3\x80\x80"] = " ";
}

// Splits `len` bytes at `pSrc` into UTF-8 characters and appends them to
// `output`; returns output.size(). The lead byte selects a length of 1-6 bytes.
// When the sequence is truncated or one of its continuation bytes is not of the
// form 10xxxxxx, the lead byte is emitted on its own so that malformed input
// can neither swallow the characters that follow it nor be silently dropped.
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output) const
{
    if (pSrc == NULL)
        return output.size();

    unsigned int pos = 0;
    while (pos < len)
    {
        const unsigned char lead = static_cast<unsigned char>(pSrc[pos]);

        unsigned int wlen = 1;
        if (lead >= 0xFC)
            wlen = 6;
        else if (lead >= 0xF8)
            wlen = 5;
        else if (lead >= 0xF0)
            wlen = 4;
        else if (lead >= 0xE0)
            wlen = 3;
        else if (lead >= 0xC0)
            wlen = 2;

        // Compare against the remaining byte count; `pos + wlen` could wrap.
        if (wlen > len - pos)
            wlen = 1;

        for (unsigned int j = 1; j < wlen; ++j)
        {
            const unsigned char next = static_cast<unsigned char>(pSrc[pos + j]);
            if ((next & 0xC0) != 0x80)
            {
                wlen = 1;
                break;
            }
        }

        output.push_back(std::string(pSrc + pos, wlen));
        pos += wlen;
    }
    return output.size();
}

// Gets the conversion character corresponding to a special character.
const std::string* CDisableWord::GetSpecialWord(const std::string& str) const
{
    const std::map<std::string, std::string>::const_iterator found = m_mapSpecialWord.find(str);
    if (found == m_mapSpecialWord.end())
        return NULL;
    return &(found->second);
}

// Returns the characters of `str` after the special-character conversion.
std::vector<std::string> CDisableWord::NormalizeWords(const std::string& str) const
{
    std::vector<std::string> words;
    SplitWord(str.data(), static_cast<unsigned int>(str.size()), words);
    for (size_t i = 0; i < words.size(); ++i)
    {
        const std::string* converted = GetSpecialWord(words[i]);
        if (converted != NULL)
            words[i] = *converted;
    }
    return words;
}

// Registers `str` as a disabled word. The word is stored in normalized form
// because CheckStr normalizes the text it tests; an un-normalized word such as
// "Chinese" could otherwise never match. Empty and duplicate words are ignored.
void CDisableWord::AddOneDisableWord(const std::string& str)
{
    const std::vector<std::string> words = NormalizeWords(str);
    if (words.empty())
        return;

    std::string normalized;
    for (size_t i = 0; i < words.size(); ++i)
        normalized += words[i];

    // insert() reports whether the word was new, so one lookup both checks and records it.
    if (!m_setAllDisableWord.insert(normalized).second)
        return;

    SDisableWord sdw;
    sdw.str = normalized;
    m_mapDisableWord[words[0]].push_back(sdw);
}

// Buffer overload of CheckStr; a null or empty buffer contains no disabled word.
bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if (pSrc == NULL || len == 0)
        return true;
    return CheckStr(std::string(pSrc, len));
}

// Returns true when `str` contains no disabled word, false otherwise.
// Two forms of the normalized text are searched:
//   - strSrc:      every character kept;
//   - strDelSpace: a space is kept only at the very start of the text or
//                  directly after a single-byte (English) character, so runs of
//                  spaces collapse to one and spaces after multi-byte
//                  characters disappear.
bool CDisableWord::CheckStr(const std::string& str)
{
    if (str.empty())
        return true;

    const std::vector<std::string> words = NormalizeWords(str);

    std::string strSrc;
    std::string strDelSpace;
    std::set<std::string> distinctWords;
    for (size_t i = 0; i < words.size(); ++i)
    {
        const std::string& word = words[i];
        distinctWords.insert(word);
        strSrc += word;

        const bool dropSpace =
            word == " " && i > 0 && (words[i - 1] == " " || words[i - 1].size() > 1);
        if (!dropSpace)
            strDelSpace += word;
    }

    // Skip the second search when removing spaces changed nothing.
    const bool bSame = (strDelSpace == strSrc);

    // Only disabled words that start with a character present in the text can match.
    for (std::set<std::string>::const_iterator siter = distinctWords.begin();
         siter != distinctWords.end(); ++siter)
    {
        const std::map<std::string, VDW>::const_iterator miter = m_mapDisableWord.find(*siter);
        if (miter == m_mapDisableWord.end())
            continue;

        const VDW& candidates = miter->second;
        for (size_t j = 0; j < candidates.size(); ++j)
        {
            const std::string& banned = candidates[j].str;
            if (strSrc.find(banned) != std::string::npos)
                return false;
            if (!bSame && strDelSpace.find(banned) != std::string::npos)
                return false;
        }
    }
    return true;
}

// Interactive demo: reads lines from standard input until end of input and
// reports whether each one contains a disabled word. Define DISABLEWORD_NO_MAIN
// to compile the class without this driver (e.g. inside a test program).
#ifndef DISABLEWORD_NO_MAIN
int main()
{
    CDisableWord cdw;

    // Setting disabled words
    const char* const strdw[] = {"Chinese", "English", "test", "aabb", "measure try", "cc dd"};
    const size_t count = sizeof(strdw) / sizeof(strdw[0]);
    for (size_t i = 0; i < count; ++i)
        cdw.AddOneDisableWord(strdw[i]);

    // std::getline has no fixed line-length limit and the loop stops at end of input.
    std::string line;
    while (std::getline(std::cin, line))
    {
        if (cdw.CheckStr(line))
            printf("Received:%s No sensitive words\n", line.c_str());
        else
            printf("Received:%s Sensitive word Sensitive word Sensitive word\n", line.c_str());
    }
    return 0;
}
#endif

// g++ -g -o DisableWord DisableWord.cpp