Disabled Word Detection

Keywords: C++ REST

UTF-8 encoded data can be passed directly to the code below.

The most critical step is splitting the string into individual characters. In UTF-8, a single-byte character has its highest bit set to 0. For a multi-byte character, the number of consecutive 1 bits at the top of the first byte gives the total number of bytes in the character, and every following byte starts with the bits 10.

As originally defined, a UTF-8 character can occupy up to six bytes (RFC 3629 later restricted valid UTF-8 to at most four bytes).

1 byte 0xxxxxxx

2 bytes 110xxxxx 10xxxxxx

3 bytes 1110xxxx 10xxxxxx 10xxxxxx

4 bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

5 bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

6 bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx


The rest is simple.

1. Disabled Word Processing

2. String to be tested

(a) Split it into individual characters

(b) Convert uppercase letters to lowercase, convert full-width letters and spaces to half-width, and remove excess spaces (English letters keep at most one space; Chinese text should have no spaces)

(c) Go through every distinct character of the string and check whether any disabled word starting with that character occurs in the string being tested

#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <sstream>
#include <string.h>
#include <stdio.h>

// Disabled-word (sensitive-word) detector for UTF-8 text.
//
// Usage: register words with AddOneDisableWord(), then call CheckStr() on the
// text to be tested. CheckStr() returns true when the text is clean and false
// when it contains a registered word.
//
// Both the registered words and the tested text are normalized the same way
// before comparison: ASCII uppercase and full-width Latin letters become ASCII
// lowercase, and the full-width (ideographic) space becomes an ASCII space.
class CDisableWord
{
    struct SDisableWord
    {
        std::string str;    // disabled word in normalized form
    };
    typedef std::vector<SDisableWord> VDW;

private:
    // Disabled words grouped by their first (normalized) character, so that a
    // check only has to try the words whose first character occurs in the text.
    std::map<std::string, VDW>          m_mapDisableWord;
    // Every normalized disabled word registered so far; used to skip duplicates.
    std::set<std::string>               m_setAllDisableWord;

    // Special conversion: character -> replacement applied during normalization.
    std::map<std::string, std::string>  m_mapSpecialWord;

private:
    // Splitting strings into single words (one UTF-8 character per entry).
    size_t SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output);
    // Get the conversion word corresponding to the special word.
    const std::string* GetSpecialWord(const std::string& str);
    // Replace, in place, every word that has an entry in m_mapSpecialWord.
    void NormalizeWords(std::vector<std::string>& words);

public:
    CDisableWord();

    // Setting Disabled Words
    void AddOneDisableWord(const std::string& str);

    // Detection: true = no disabled word found, false = disabled word found.
    bool CheckStr(const char* pSrc, unsigned int len);
    bool CheckStr(const std::string& str);
};

// Builds the conversion table used for normalization:
//   full-width 'A'..'Z' (U+FF21..U+FF3A, UTF-8 EF BC A1..BA) -> 'a'..'z'
//   full-width 'a'..'z' (U+FF41..U+FF5A, UTF-8 EF BD 81..9A) -> 'a'..'z'
//   ASCII      'A'..'Z'                                      -> 'a'..'z'
//   ideographic space   (U+3000,         UTF-8 E3 80 80)     -> ' '
// The full-width forms are written as byte escapes so the table does not
// depend on the encoding in which this source file is saved.
CDisableWord::CDisableWord()
{
    const std::string upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    const std::string lower = "abcdefghijklmnopqrstuvwxyz";

    for(size_t i = 0; i < lower.size(); ++i)
    {
        const std::string target(1, lower[i]);

        std::string fullUpper("\xEF\xBC");
        fullUpper += static_cast<char>(0xA1 + i);
        std::string fullLower("\xEF\xBD");
        fullLower += static_cast<char>(0x81 + i);

        m_mapSpecialWord[fullUpper] = target;
        m_mapSpecialWord[fullLower] = target;
        m_mapSpecialWord[std::string(1, upper[i])] = target;
    }

    m_mapSpecialWord["\xE3\x80\x80"] = " ";
}

// Splits the first `len` bytes of `pSrc` into UTF-8 characters and appends
// them to `output`. Returns output.size() after appending.
//
// The lead byte selects the sequence length (1 to 6 bytes, following the
// original UTF-8 definition). A multi-byte sequence is accepted only if it
// fits in the buffer and every trailing byte has the form 10xxxxxx; otherwise
// the lead byte is emitted alone as a one-byte word. This keeps a stray lead
// byte from swallowing the ASCII characters that follow it (which would hide
// them from the first-character lookup in CheckStr) and keeps a truncated
// tail from being dropped.
size_t CDisableWord::SplitWord(const char* pSrc, unsigned int len, std::vector<std::string>& output)
{
    if(pSrc == NULL)
        return output.size();

    unsigned int pos = 0;
    while(pos < len)
    {
        const unsigned char lead = static_cast<unsigned char>(pSrc[pos]);

        unsigned int wlen = 1;
        if(lead >= 0xFE)
            wlen = 1;       // 0xFE / 0xFF never start a UTF-8 sequence
        else if(lead >= 0xFC)
            wlen = 6;
        else if(lead >= 0xF8)
            wlen = 5;
        else if(lead >= 0xF0)
            wlen = 4;
        else if(lead >= 0xE0)
            wlen = 3;
        else if(lead >= 0xC0)
            wlen = 2;

        if(wlen > 1)
        {
            // pos < len here, so len - pos cannot wrap around.
            bool valid = (wlen <= len - pos);
            for(unsigned int k = 1; valid && k < wlen; ++k)
                valid = ((static_cast<unsigned char>(pSrc[pos + k]) & 0xC0) == 0x80);

            if(!valid)
                wlen = 1;
        }

        output.push_back(std::string(pSrc + pos, wlen));
        pos += wlen;
    }

    return output.size();
}

// Get the conversion word corresponding to the special word.
// Returns a pointer to the replacement stored in m_mapSpecialWord, or NULL
// when `str` has no conversion.
const std::string* CDisableWord::GetSpecialWord(const std::string& str)
{
    std::map<std::string, std::string>::const_iterator found = m_mapSpecialWord.find(str);
    return (found == m_mapSpecialWord.end()) ? NULL : &(found->second);
}

// Applies the special-word conversion to every element of `words`.
void CDisableWord::NormalizeWords(std::vector<std::string>& words)
{
    for(size_t i = 0; i < words.size(); ++i)
    {
        const std::string* converted = GetSpecialWord(words[i]);
        if(converted)
            words[i] = *converted;
    }
}

// Registers one disabled word.
//
// The word is normalized with the same conversion CheckStr applies to its
// input; without that, a word containing uppercase or full-width letters could
// never match the lower-cased text. Empty words and words that are already
// registered (after normalization) are ignored.
void CDisableWord::AddOneDisableWord(const std::string& str)
{
    std::vector<std::string> words;
    if(SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words) == 0)
        return;

    NormalizeWords(words);

    std::string normalized;
    for(size_t i = 0; i < words.size(); ++i)
        normalized += words[i];

    // insert() reports whether the word was new; duplicates stop here.
    if(!m_setAllDisableWord.insert(normalized).second)
        return;

    SDisableWord sdw;
    sdw.str = normalized;
    m_mapDisableWord[words[0]].push_back(sdw);
}

// Checks the first `len` bytes of `pSrc`.
// Returns true for a NULL or empty buffer; otherwise forwards to the
// std::string overload.
bool CDisableWord::CheckStr(const char* pSrc, unsigned int len)
{
    if(pSrc == NULL || len == 0)
        return true;

    return CheckStr(std::string(pSrc, len));
}

// Checks `str` for disabled words.
// Returns true when none is found (an empty string is always clean) and
// false when at least one registered word occurs in the text.
//
// Two forms of the normalized text are searched:
//   strSrc      - the normalized text as is;
//   strDelSpace - the normalized text with some spaces removed (see below).
bool CDisableWord::CheckStr(const std::string& str)
{
    if(str.empty())
        return true;

    std::vector<std::string> words;
    // SplitWord yields at least one word for non-empty input; the check is
    // kept purely as a defensive guard.
    if(SplitWord(str.c_str(), static_cast<unsigned int>(str.size()), words) == 0)
        return false;

    // Capital to lowercase, full-width to half-width.
    NormalizeWords(words);

    std::string strSrc;                     // Converted string
    std::string strDelSpace;                // Converted string with excess spaces removed
    std::set<std::string> distinctWords;    // each character of the text, once

    // State of the nearest non-space character to the left of the current one.
    bool   seenNonSpace = false;
    size_t lastNonSpace = 0;
    bool   lastNonSpaceMultiByte = false;

    for(size_t i = 0; i < words.size(); ++i)
    {
        const std::string& word = words[i];
        distinctWords.insert(word);
        strSrc += word;

        bool keep = true;
        if(word == " ")
        {
            // A space at index 0 is always kept. Any later space is kept only
            // if some non-space character precedes it and either
            //   - that character is multi-byte, or
            //   - that character is single-byte and immediately precedes the
            //     space (so a run of spaces after an ASCII character
            //     collapses to one).
            // NOTE(review): the accompanying description says spaces after
            // non-English characters should be deleted, but the original code
            // kept them; that behaviour is preserved here - confirm which is
            // intended.
            if(i > 0)
                keep = seenNonSpace && (lastNonSpaceMultiByte || lastNonSpace + 1 == i);
        }
        else
        {
            seenNonSpace = true;
            lastNonSpace = i;
            lastNonSpaceMultiByte = (word.size() > 1);
        }

        if(keep)
            strDelSpace += word;
    }

    // Searching strDelSpace is only useful when it differs from strSrc.
    const bool spacesRemoved = (strDelSpace != strSrc);

    // Only disabled words whose first character occurs in the text can match.
    for(std::set<std::string>::const_iterator it = distinctWords.begin(); it != distinctWords.end(); ++it)
    {
        std::map<std::string, VDW>::const_iterator entry = m_mapDisableWord.find(*it);
        if(entry == m_mapDisableWord.end())
            continue;

        const VDW& candidates = entry->second;
        for(size_t j = 0; j < candidates.size(); ++j)
        {
            const std::string& bad = candidates[j].str;
            if(strSrc.find(bad) != std::string::npos)
                return false;
            if(spacesRemoved && strDelSpace.find(bad) != std::string::npos)
                return false;
        }
    }

    return true;
}

int main()
{
    CDisableWord cdw;

	//Setting Disabled Words
    std::string strdw[] = {"Chinese", "English", "test", "aabb", "measure try", "cc dd"};
    for(int i = 0; i < 6; i++)
        cdw.AddOneDisableWord(strdw[i]);

    while(1)
    {
        char s[51];
        std::cin.getline(s,50);

        if(cdw.CheckStr(s, strlen(s)))
            printf("Received:%s  No sensitive words\n", s);
        else
            printf("Received:%s  Sensitive word Sensitive word Sensitive word\n", s);
    }

    return 0;
}

// g++ -g -o DisableWord DisableWord.cpp


Posted by Dale on Thu, 31 Jan 2019 03:18:16 -0800