scel2txt: convert Sogou scel format to TXT with Python 3

Keywords: Python, encoding

The original code does not work under Python 3 and has quite a few bugs; some of its functions are unnecessary, and parts of the code are non-standard or redundant. So, while keeping the original overall structure, I made many detailed changes: the output is no longer garbled, a whole folder of .scel files can be converted at once, and so on.

To use it, just change the input and output paths at the bottom of the script to your own (a command-line variant is sketched after the code).

import struct
import os

# The original code does not work under Python 3 and has quite a few bugs;
# some of its functions are unnecessary, and parts of the code are non-standard or redundant.
# So, while keeping the original overall structure, I made many detailed changes:
# no more garbled output, more convenient folder-based import, and so on.
# Author: Ling Yue, Taiyuan U of Tech
# Blog: http://blog.yueling.me


# Original author's notes:
# Sogou's scel lexicon stores its text as Unicode, two bytes per character (a Chinese character or an English letter).
# The offsets of the individual parts were worked out by inspection.
# There are two main parts:
# 1. Global pinyin table, apparently all pinyin combinations, in dictionary order,
#       stored as a list of entries of the form (index, len, pinyin)
#       index: two-byte integer, the index of this pinyin
#       len: two-byte integer, the byte length of the pinyin string
#       pinyin: the pinyin itself, two bytes per character, len bytes in total
#
# 2. Chinese phrase list,
#       a list of entries of the form (same, py_table_len, py_table, {word_len, word, ext_len, ext})
#       same: two-byte integer, the number of homophones (phrases sharing this pinyin)
#       py_table_len: two-byte integer, the byte length of py_table
#       py_table: a list of two-byte integers, each an index into the pinyin table
#
#       word_len: two-byte integer, the byte length of the Chinese phrase
#       word: the Chinese phrase, two bytes per Chinese character, word_len bytes in total
#       ext_len: two-byte integer, the length of the extended information (it seems to always be 10)
#       ext: the first two bytes of the extended information are an integer (possibly the word frequency?); the remaining eight bytes appear to be all zero
#
#       {word_len, word, ext_len, ext} is repeated `same` times, once for each homophone of the same pinyin
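
# For reference, the following small helper is my own illustrative addition (it is never
# called by this script): it shows how one entry of the pinyin table described above could
# be parsed with struct.unpack_from, assuming little-endian two-byte integers and UTF-16LE text.
def _demo_parse_py_entry(data, pos):
    # index of the pinyin and byte length of the pinyin string, two bytes each
    index, len_py = struct.unpack_from('<HH', data, pos)
    # the pinyin itself: len_py bytes of UTF-16LE text
    py = data[pos + 4: pos + 4 + len_py].decode('UTF-16LE')
    # return the parsed entry together with the offset of the next entry
    return index, py, pos + 4 + len_py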


# Pinyin table offset
startPy = 0x1540

# Chinese phrase list offset
startChinese = 0x2628

# Global Pinyin Table
GPy_Table = {}

# Parse results:
# a list of tuples (word frequency, pinyin, Chinese phrase)
GTable = []

# Convert raw two-byte Unicode (UTF-16LE) data to a string, skipping NUL characters
def byte2str(data):
    pos = 0
    ret = ''
    while pos < len(data):
        c = chr(struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0])
        if c != chr(0):
            ret += c
        pos += 2
    return ret

# Parse the global pinyin table
def getPyTable(data):
    data = data[4:]  # skip the 4-byte header of the pinyin table
    pos = 0
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos],data[pos + 1]]))[0]
        pos += 2
        lenPy = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        py = byte2str(data[pos:pos + lenPy])

        GPy_Table[index] = py
        pos += lenPy

# Get the Pinyin of a phrase
def getWordPy(data):
    pos = 0
    ret = ''
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        ret += GPy_Table[index]
        pos += 2
    return ret

# Parse the Chinese phrase table
def getChinese(data):
    pos = 0
    while pos < len(data):
        # number of homophones sharing this pinyin
        same = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

        # Length of Pinyin Index Table
        pos += 2
        py_table_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

        # Pinyin Index Table
        pos += 2
        py = getWordPy(data[pos: pos + py_table_len])

        # Chinese phrase
        pos += py_table_len
        for i in range(same):
            # Chinese phrase length
            c_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            # Chinese phrase
            pos += 2
            word = byte2str(data[pos: pos + c_len])
            # Extended data length
            pos += c_len
            ext_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            # word frequency
            pos += 2
            count = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]

            # save the result
            GTable.append((count, py, word))

            # skip the rest of the extended info to reach the next entry
            pos += ext_len


def scel2txt(file_name):
    print('-' * 60)
    with open(file_name, 'rb') as f:
        data = f.read()

    print("Thesaurus Name:", byte2str(data[0x130:0x338])) # .encode('GB18030')
    print("Lexicon types:", byte2str(data[0x338:0x540]))
    print("Descriptive information:", byte2str(data[0x540:0xd40]))
    print("Lexicon examples:", byte2str(data[0xd40:startPy]))

    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])

if __name__ == '__main__':

    # folder containing the .scel files
    in_path = "/Users/really/Documents/coal_dict"
    # path of the output dictionary file
    out_path = "coal_dict.txt"

    fin = [fname for fname in os.listdir(in_path) if fname.endswith(".scel")]
    for f in fin:
        scel2txt(os.path.join(in_path, f))

    # Save results
    with open(out_path, 'w', encoding='utf8') as f:
        f.writelines([word+'\n' for count, py, word in GTable])
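
For convenience, the input folder and output file can also be supplied on the command line instead of being edited in the source. The following is only a sketch of an alternative __main__ block that reuses scel2txt and GTable from the script above; the use of argparse and the tab-separated "frequency, pinyin, word" output format are my own choices, not part of the original code.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert a folder of Sogou .scel files to a TXT word list')
    parser.add_argument('in_path', help='folder containing the .scel files')
    parser.add_argument('out_path', help='path of the output TXT file')
    args = parser.parse_args()

    # convert every .scel file in the input folder
    for fname in os.listdir(args.in_path):
        if fname.endswith('.scel'):
            scel2txt(os.path.join(args.in_path, fname))

    # write "frequency<TAB>pinyin<TAB>word", most frequent entries first
    with open(args.out_path, 'w', encoding='utf8') as f:
        for count, py, word in sorted(GTable, reverse=True):
            f.write('%d\t%s\t%s\n' % (count, py, word))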

Original code framework: http://blog.csdn.net/zhangzhenhu/article/details/7014271
