# The original code does not run on Python 3 and has a lot of bugs.
# It also contains unnecessary functions, and some of the code is non-standard or redundant.
# So I made many detailed changes while keeping the original overall framework basically unchanged:
# no more garbled text in the output, importing from a folder is supported, and so on.
# To use it, change the input and output paths.
import struct
import os
# The original code does not run on Python 3 and has a lot of bugs.
# It also contains unnecessary functions, and some of the code is non-standard or redundant.
# So I made many detailed changes while keeping the original overall framework basically unchanged:
# no more garbled text in the output, more convenient importing from a folder, and so on.
# Author: Ling Yue, Taiyuan U of Tech
# Blog: http://blog.yueling.me
# Original author:
# Sogou's .scel lexicon stores its text as Unicode, two bytes per character (Chinese character or English letter).
# The parser relies on the offset of each part of the file.
# There are two main parts:
# 1. Global pinyin table, seemingly all pinyin combinations, in dictionary order
# A list of entries of the form (index, len, pinyin)
# index: two-byte integer, the index of this pinyin
# len: two-byte integer, the byte length of the pinyin
# pinyin: the pinyin itself, two bytes per character, len bytes in total
#
# 2. Chinese phrase list
# A list of entries of the form (same, py_table_len, py_table, {word_len, word, ext_len, ext})
# same: two-byte integer, the number of homophones
# py_table_len: two-byte integer, the byte length of py_table
# py_table: list of two-byte integers, each one an index into the pinyin table
#
# word_len: two-byte integer, the byte length of the Chinese phrase
# word: the Chinese phrase, two bytes per character, word_len bytes in total
# ext_len: two-byte integer, the length of the extended information (seemingly always 10)
# ext: extended information; the first two bytes are an integer (possibly the word frequency), the last eight bytes seem to be all zero
#
# {word_len, word, ext_len, ext} is repeated `same` times; all of these homophones share the same pinyin table
# Byte offset in the .scel file at which the global pinyin table begins.
startPy = 0x1540
# Byte offset in the .scel file at which the Chinese phrase list begins.
startChinese = 0x2628
# Global pinyin table: maps a pinyin index to its pinyin string.
# Filled by getPyTable() and read by getWordPy().
GPy_Table = {}
# Parsed result: a list of (word frequency, pinyin, Chinese phrase) tuples.
# Filled by getChinese().
GTable = []
# Converting the original bytecode to a string
def byte2str(data):
    """Decode raw two-byte little-endian character data into a string.

    Every pair of bytes is read as one unsigned 16-bit code unit and turned
    into a character with ``chr``; zero code units (padding) are dropped.

    Args:
        data: A ``bytes`` object holding the encoded text.

    Returns:
        The decoded text with all NUL characters removed. A dangling final
        byte (odd-length input) is ignored.
    """
    # The byte order is given explicitly ('<') so the result does not depend
    # on the byte order of the machine running the script.
    usable = len(data) - (len(data) % 2)
    code_units = struct.iter_unpack('<H', data[:usable])
    return ''.join(chr(unit) for (unit,) in code_units if unit)
# Getting Pinyin Tables
def getPyTable(data):
    """Parse the global pinyin table and store it in ``GPy_Table``.

    After the first four bytes, ``data`` is read as a sequence of records:
    a two-byte index, a two-byte byte length, and that many bytes of pinyin
    text. Each record is stored as ``GPy_Table[index] = pinyin``.

    Args:
        data: The ``bytes`` slice of the file that holds the pinyin table.

    Raises:
        struct.error: If ``data`` ends in the middle of a record header.
    """
    # The first four bytes are skipped; this function never reads them.
    data = data[4:]
    pos = 0
    while pos < len(data):
        # Record header: both fields are little-endian unsigned 16-bit values.
        index, py_len = struct.unpack_from('<HH', data, pos)
        pos += 4
        GPy_Table[index] = byte2str(data[pos:pos + py_len])
        pos += py_len
# Get the Pinyin of a phrase
def getWordPy(data):
    """Return the pinyin of a phrase from its pinyin index table.

    Args:
        data: A ``bytes`` object of two-byte little-endian indices, each of
            which is looked up in ``GPy_Table``.

    Returns:
        The looked-up pinyin strings concatenated in order, no separator.

    Raises:
        KeyError: If an index is not present in ``GPy_Table``.
        struct.error: If ``len(data)`` is not a multiple of two.
    """
    # Explicit '<' keeps the decoding independent of the host byte order.
    return ''.join(
        GPy_Table[index] for (index,) in struct.iter_unpack('<H', data)
    )
# Reading Chinese Table
def getChinese(data):
    """Parse the Chinese phrase list and append its entries to ``GTable``.

    ``data`` is read as a sequence of homophone groups. Each group starts
    with two little-endian 16-bit integers (number of phrases in the group,
    byte length of the pinyin index table), followed by the pinyin index
    table and then one record per phrase: a two-byte phrase length, the
    phrase text, a two-byte extension length, and the extension data, whose
    first two bytes are read as the word frequency.

    Every phrase is appended to ``GTable`` as ``(count, pinyin, word)``.

    Args:
        data: The ``bytes`` slice of the file that holds the phrase list.

    Raises:
        struct.error: If ``data`` ends in the middle of a fixed-size field.
        KeyError: If a pinyin index is missing from ``GPy_Table``
            (raised by ``getWordPy``).
    """
    pos = 0
    while pos < len(data):
        # Group header: homophone count and byte length of the index table.
        same, py_table_len = struct.unpack_from('<HH', data, pos)
        pos += 4
        # All phrases of the group share this pinyin.
        py = getWordPy(data[pos:pos + py_table_len])
        pos += py_table_len
        for _ in range(same):
            # Byte length of the phrase, followed by the phrase itself.
            (c_len,) = struct.unpack_from('<H', data, pos)
            pos += 2
            word = byte2str(data[pos:pos + c_len])
            pos += c_len
            # Extension length, then the first two extension bytes
            # (treated as the word frequency).
            ext_len, count = struct.unpack_from('<HH', data, pos)
            GTable.append((count, py, word))
            # Skip the length field and the whole extension to reach the
            # next phrase record.
            pos += 2 + ext_len
def scel2txt(file_name):
    """Read one .scel file, print its header fields and parse its tables.

    The four header text fields are printed, then the pinyin table is loaded
    with ``getPyTable`` and the phrase list with ``getChinese`` (both store
    their results in module-level containers).

    Args:
        file_name: Path of the .scel file to read.
    """
    print('-' * 60)
    with open(file_name, 'rb') as scel_file:
        data = scel_file.read()

    # (label, start offset, end offset) of each header text field.
    header_fields = (
        ("Thesaurus Name:", 0x130, 0x338),
        ("Lexicon types:", 0x338, 0x540),
        ("Descriptive information:", 0x540, 0xd40),
        ("Lexicon examples:", 0xd40, startPy),
    )
    for label, begin, end in header_fields:
        print(label, byte2str(data[begin:end]))

    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])
if __name__ == '__main__':
    # Folder that contains the .scel files to convert.
    in_path = "/Users/really/Documents/coal_dict"
    # Path of the text file the extracted words are written to.
    out_path = "coal_dict.txt"

    # os.listdir() returns bare file names, so each one must be joined with
    # in_path; otherwise the files are only found when the current working
    # directory happens to be in_path.
    scel_files = [
        os.path.join(in_path, fname)
        for fname in os.listdir(in_path)
        if fname.endswith(".scel")
    ]
    for scel_file in scel_files:
        scel2txt(scel_file)

    # Save the results: one Chinese phrase per line.
    with open(out_path, 'w', encoding='utf8') as out_file:
        out_file.writelines(word + '\n' for _count, _py, word in GTable)
# Reference code framework, original author's link: http://blog.csdn.net/zhangzhenhu/article/details/7014271