Learning Python web crawling - scraping the public service platform for construction market supervision

Keywords: SQL Database MySQL Windows

Learning Python web crawling - scraping the public service platform for construction market supervision

This code is for learning and discussion only. Please do not use it for illegal purposes. In case of infringement, please click here to contact the author to have it removed.

This code is for learning and discussion only. Please do not use it for illegal purposes. In case of infringement, please click here to contact the author to have it removed.

This code is for learning and discussion only. Please do not use it for illegal purposes. In case of infringement, please click here to contact the author to have it removed.

The code depends on decrypting the API responses, as well as on the daily decryption algorithm for enterprise IDs and personnel IDs. The code here is shared only for learning and discussion; the decryption part will not be released.

I have been busy recently, so I am sharing few development and learning updates. If you have development or learning questions, click here to discuss them with me on QQ.

1. Code implementation

# -*- coding:utf-8 -*-
import requests
import re
import hashlib
from decrypter import decrypt, encryptId, decryptId
import configparser
import MySQLdb
import time
import random


# Module-wide configuration parser, populated from config.ini in the current
# working directory.
cf = configparser.ConfigParser()
try:
    # ConfigParser.read() does NOT raise when the file is missing: it skips the
    # file and returns the list of files it actually parsed. An empty list is
    # therefore the only reliable sign that config.ini is absent.
    loaded_files = cf.read("config.ini")
except Exception as e:
    # The file exists but could not be read or parsed (bad syntax, encoding...).
    print("config.ini could not be read: %s" % e)
    exit(0)
if not loaded_files:
    print("Program directory does not exist config.ini configuration file~")
    exit(0)


def getConf(sec, key):
    """Return option ``key`` from section ``sec`` of the loaded configuration.

    If the lookup fails for any reason, a message naming the section and key
    is printed and the process exits with status 0.

    :param sec: configuration section name.
    :param key: option name inside that section.
    :return: the option value as returned by ``cf.get``.
    """
    try:
        value = cf.get(sec, key)
    except Exception:
        print("The following configuration is not available:" + sec + " - " + key)
        exit(0)
    return value

# -------------------------------------------------
# Module-level settings. Each getConf() call exits the process if its option is
# missing, so the order below decides which missing option is reported first.
#
# offset / limit are kept as strings because they are concatenated directly
# into the "limit <offset>,<limit>" clause of the SQL built further down.
offset = str(getConf("app-sys", "offset"))
limit = str(getConf("app-sys", "limit"))
# Database account
mysql_user = getConf("Mysql-Database", "user")
# Database password
mysql_password = getConf("Mysql-Database", "password")
# Database name
mysql_database = getConf("Mysql-Database", "database")
# Database host and port (the port is converted with int() at connect time)
mysql_host = getConf("Mysql-Database", "host")
mysql_port = getConf("Mysql-Database", "port")
# Access token sent in the "accessToken" request header
token = getConf("web-param", "token")
# Bounds of the random pause after each successful request; getSleepTime()
# divides the drawn value by 1000 before it is passed to time.sleep().
min_sleep = int(getConf("app-sys", "min_sleep"))
max_sleep = int(getConf("app-sys", "max_sleep"))
# Value passed as the `timeout` argument of requests.get()
timeout = 20
# Number of attempts getHtml() makes per URL
retry = 3
# Headers sent with every request.
headers = {
    "Referer": "http://jzsc.mohurd.gov.cn/data/company",
    # NOTE(review): this is sent as a literal HTTP header named "timeout"; it
    # does not configure the client-side timeout (see `timeout` above).
    "timeout": "30000",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
    "accessToken": token
}


def getSleepTime():
    """Return a random pause length for use between requests.

    A whole number is drawn uniformly from ``[min_sleep, max_sleep]`` and
    divided by 1000 before being returned as a float.
    """
    drawn = random.randint(min_sleep, max_sleep)
    return float(drawn / 1000)


def execSql(sql, params=None):
    """Run one write statement on a fresh MySQL connection and commit it.

    The connection is always closed before returning, whether or not the
    statement succeeded.

    :param sql: SQL text to execute.
    :param params: optional values for the driver to bind to the placeholders
        in ``sql``. When omitted (the default), ``sql`` is executed verbatim.
    :return: ``True`` if the statement ran and was committed, otherwise ``False``.
    """
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host,
                               port=int(mysql_port), database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        return True
    except Exception as e:
        # Callers only look at the boolean result, so stay best-effort, but
        # report the reason instead of discarding it silently.
        print("SQL execution failed: %s" % e)
        return False
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Closing is cleanup only; never let it mask the real outcome.
                pass


def querySql(sql, params=None):
    """Run one query on a fresh MySQL connection and return all of its rows.

    The connection is always closed before returning.

    :param sql: SQL text to execute.
    :param params: optional values for the driver to bind to the placeholders
        in ``sql``. When omitted (the default), ``sql`` is executed verbatim.
    :return: the result of ``cursor.fetchall()``, or ``None`` if connecting or
        executing failed.
    """
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, host=mysql_host,
                               port=int(mysql_port), database=mysql_database, charset='utf8')
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return cursor.fetchall()
    except Exception as e:
        # Keep the "None on failure" contract, but report why the query failed.
        print("SQL query failed: %s" % e)
        return None
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Closing is cleanup only; never let it mask the real outcome.
                pass


def getHash(s):
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8.

    :param s: text to hash.
    :return: 32-character lowercase hex string.
    """
    return hashlib.md5(s.encode("utf-8")).hexdigest()


def updateToken(url):
    """Loop until the token stored in config.ini is accepted for ``url``.

    Each round re-reads config.ini, reloads the ``web-param``/``token`` option
    into the module-level ``token``, prints it, waits five seconds, rebuilds
    the module-level ``headers`` with that token and requests ``url``. The
    function returns once the decrypted response carries code ``200``; any
    error during the request is ignored and the loop goes round again.

    :param url: URL used to probe whether the token works.
    """
    global token
    global headers
    while True:
        try:
            cf.read("config.ini")
        except Exception:
            print("Program directory does not exist config.ini configuration file~")
            exit(0)
        token = getConf("web-param", "token")
        print("token Invalid, please replace~\n at present token: %s" % token)
        time.sleep(5)
        refreshed_headers = {
            "Referer": "http://jzsc.mohurd.gov.cn/data/company",
            "timeout": "30000",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
            "accessToken": token
        }
        headers = refreshed_headers
        try:
            reply = requests.get(url, headers=headers, timeout=timeout)
            payload = decrypt(reply.content.decode("utf-8"))
            if str(payload['code']) == "200":
                print("token Update success~")
                return
        except Exception:
            # Any failure just means "try again with whatever token is configured".
            pass


def getHtml(url):
    """Fetch ``url`` and return its decrypted response, trying up to ``retry`` times.

    A response whose code is ``408`` triggers ``updateToken(url)`` and then the
    next attempt. Any other response is returned after a random pause. Errors
    raised during an attempt are ignored and the next attempt starts.

    :param url: URL to request.
    :return: the value produced by ``decrypt`` for the response body, or
        ``None`` once every attempt has been used up.
    """
    for _attempt in range(retry):
        try:
            reply = requests.get(url, headers=headers, timeout=timeout)
            payload = decrypt(reply.content.decode("utf-8"))
            if str(payload['code']) == "408":
                updateToken(url)
                continue
            time.sleep(getSleepTime())
            return payload
        except Exception:
            pass
    return None


def getCompanyList(page):
    """Return the company list found in one page of the listing endpoint.

    :param page: page number placed in the ``pg`` query parameter.
    :return: the value at ``['data']['list']`` of the response, or ``None`` if
        the request produced nothing or that path is missing.
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?pg=%s&pgsz=15&total=450" % (page,)
    payload = getHtml(url)
    if not payload:
        return None
    try:
        return payload['data']['list']
    except Exception:
        return None


def getAreas(s):
    """Split ``s`` into the text before and after its last ``-``.

    The pattern is greedy, so with several dashes on the same line the split
    happens at the last one.

    :param s: text of the form ``"<left>-<right>"``.
    :return: a ``(left, right)`` tuple, or ``None`` if ``s`` contains no dash
        or cannot be searched at all.
    """
    try:
        return re.search("(.*)-(.*)", s).groups()
    except Exception:
        return None


def tsToDate(ts):
    """Convert a millisecond timestamp to a local ``YYYY-MM-DD`` date string.

    The conversion is done arithmetically instead of splicing a decimal point
    into the digits, so integer, float and numeric-string inputs all work.

    :param ts: timestamp in milliseconds (int, float, or numeric string).
    :return: the date in the local time zone, formatted as ``YYYY-MM-DD``.
    :raises TypeError, ValueError: if ``ts`` cannot be converted to a number.
    """
    seconds = float(ts) / 1000
    return time.strftime("%Y-%m-%d", time.localtime(seconds))


def getCompanyDetail(id, tid):
    """Fetch one enterprise's basic details and store them.

    The enterprise is first registered through ``addCompany``; only if that
    succeeds is a row written to ``jianshe_qiyes`` and the ``qiye_jiben`` flag
    set through ``updateCompany``. Fields missing from the response are stored
    as empty strings.

    :param id: enterprise id as expected by the ``compId`` query parameter.
    :param tid: numeric enterprise id used as the database key.
    :return: ``True`` if the detail row was inserted, otherwise ``False``.
    """
    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId=" + id)
    print("Access to basic information of the enterprise~")
    if payload:
        try:
            compMap = payload['data']['compMap']
            print("Company name:%s " % compMap['QY_NAME'])
            if addCompany(tid, compMap['QY_NAME']):

                def field(key):
                    # Missing or unreadable fields are stored as "".
                    try:
                        return compMap[key]
                    except Exception:
                        return ""

                try:
                    region = getAreas(compMap['QY_REGION_NAME'])
                except Exception:
                    region = ""

                def region_part(index):
                    # region may be "", None or a 2-tuple; anything unusable yields "".
                    try:
                        return region[index]
                    except Exception:
                        return ""

                company = [
                    tid,
                    field('QY_ORG_CODE'),
                    field('QY_FR_NAME'),
                    region_part(0),
                    region_part(1),
                    field('QY_ADDR'),
                    field('QY_NAME'),
                    field('QY_GSZCLX_NAME'),
                ]
                print("Basic information of the enterprise:%s ~" % str(company))
                # NOTE(review): values are interpolated straight into the SQL
                # text; a quote in any scraped field will break the statement.
                statement = "insert jianshe_qiyes(tid, hao, ren, diqu, city, qiye_dizhi, qiye_ming, qiye_leixing) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % ((int(company[0]),) + tuple(company[1:]))
                if execSql(statement):
                    updateCompany(tid, "qiye_jiben")
                    return True
                return False
        except Exception:
            pass
    print("The enterprise does not exist~")
    return False


def getCaDetailList(id, tid):
    '''
    Fetch the first page of an enterprise's qualification certificates and
    store every certificate in ``jianshe_zizhis``.

    Every record on the page is processed; previously the loop returned after
    the first record. The ``qiye_zizhi`` flag is set through ``updateCompany``
    once at least one row has been inserted. Fields missing from a record are
    stored as empty strings.

    :param id: enterprise id as expected by the ``qyId`` query parameter.
    :param tid: numeric enterprise id used as the database key.
    :return: ``True`` if at least one certificate was inserted, ``False`` if
        records were present but none could be inserted, ``None`` if the
        response contained no certificate list.
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/caDetailList?qyId=" + id + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("Obtain enterprise qualification certificate information~")
    try:
        certs = html['data']['pageList']['list'] if html else None
    except Exception:
        certs = None
    if not certs:
        print("No enterprise qualification information~")
        return

    def text(cert, key):
        # Plain field lookup; missing or unreadable fields become "".
        try:
            return cert[key]
        except Exception:
            return ""

    def date(cert, key):
        # Millisecond timestamp field rendered as a date; "" when unavailable.
        try:
            return tsToDate(cert[key])
        except Exception:
            return ""

    inserted = False
    for cert in certs:
        try:
            try:
                row_hash = getHash(str(tid) + cert['APT_NAME'])
            except Exception:
                row_hash = ""
            # NOTE(review): values are interpolated straight into the SQL text;
            # a quote in any scraped field will break the statement.
            sql = "insert jianshe_zizhis(tid, `hash`, hao, ming, leibie, riqi, youxiaoqi, jiguan) values(%d, '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % (
                int(tid),
                row_hash,
                text(cert, 'APT_CERTNO'),
                text(cert, 'APT_NAME'),
                text(cert, 'APT_TYPE_NAME'),
                date(cert, 'APT_GET_DATE'),
                date(cert, 'APT_EDATE'),
                text(cert, 'APT_GRANT_UNIT'),
            )
        except Exception as e:
            # One unusable record must not stop the remaining ones.
            print("Skipping qualification record: %s" % e)
            continue
        if execSql(sql):
            inserted = True
    if inserted:
        updateCompany(tid, "qiye_zizhi")
    return inserted


def getRegStaffList(id, tid):
    '''
    Fetch the first page of an enterprise's registered personnel and store
    every person in ``jianshe_qiyes_renyuans``.

    Every person on the page is processed; previously the function returned
    after the first successful insert. For each inserted person the personal
    details (``getStaff``) and practice registration (``getRegCert``) are
    fetched as well. A person whose ``RY_ID`` cannot be decrypted to a number
    is skipped rather than aborting the whole list. The ``qiye_renyuan`` flag
    is set through ``updateCompany`` on the first successful insert.

    :param id: enterprise id as expected by the ``qyId`` query parameter.
    :param tid: numeric enterprise id used as the database key.
    :return: always ``None``.
    '''
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/regStaffList?qyId=" + str(id) + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("Get new information of enterprise personnel~")
    if html:
        try:
            people = html['data']['pageList']['list']
        except Exception:
            return

        def text(person, key):
            # Plain field lookup; missing or unreadable fields become "".
            try:
                return person[key]
            except Exception:
                return ""

        stored_any = False
        for person in people or []:
            try:
                rid = int(decryptId(person['RY_ID']))
            except Exception as e:
                # Without a numeric person id the row cannot be keyed.
                print("Skipping personnel record without a usable id: %s" % e)
                continue
            try:
                reg = [
                    tid,
                    rid,
                    text(person, 'REG_SEAL_CODE'),
                    text(person, 'RY_NAME'),
                    text(person, 'IDCARD'),
                    text(person, 'REG_TYPE_NAME'),
                    text(person, 'REG_PROF_NAME'),
                ]
                print("Enterprise personnel information:%s ~" % str(reg))
                # NOTE(review): values are interpolated straight into the SQL
                # text; a quote in any scraped field will break the statement.
                sql = "insert jianshe_qiyes_renyuans(tid, rid, hao, xingming, shenfengzheng, leibie, zhuanye) values(%d, %d, '%s', '%s', '%s', '%s', '%s')" % (int(reg[0]), reg[1], reg[2], reg[3], reg[4], reg[5], reg[6])
                if execSql(sql):
                    if not stored_any:
                        updateCompany(tid, "qiye_renyuan")
                        stored_any = True
                    # Basic information of personnel
                    getStaff(person['RY_ID'], rid, tid)
                    # Personnel practice registration information
                    getRegCert(person['RY_ID'], rid, tid)
            except Exception as e:
                # One failing person must not stop the remaining ones.
                print("Failed to store personnel record %s: %s" % (rid, e))
        if stored_any:
            return
    print("No enterprise personnel information~")
    return


def staffPerformanceListSys(id, rid, tid):
    """Fetch a person's project records and store them in ``jianshe_renyuans_yejis``.

    Every record on the page is processed; previously the function returned
    after the first successful insert. Fields missing from a record are stored
    as empty strings.

    :param id: person id as expected by the ``staffId`` query parameter.
    :param rid: numeric person id used as the database key.
    :param tid: numeric enterprise id (currently unused here).
    :return: ``True`` if at least one record was inserted, ``False`` if there
        was nothing to insert or every insert failed, ``None`` if the response
        did not contain a record list.
    :raises TypeError, ValueError: if ``rid`` is not numeric and records exist.
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffPerformanceListSys?staffId=" + str(id) + "&pg=0"
    resp = getHtml(url)
    if not resp:
        return False
    try:
        records = resp['data']['pageList']['list']
    except Exception:
        return
    if not records:
        return False

    def text(record, key):
        # Plain field lookup; missing or unreadable fields become "".
        try:
            return record[key]
        except Exception:
            return ""

    person_id = int(rid)
    inserted = False
    for record in records:
        try:
            location = record['PROVINCE'] + record['CITY']
        except Exception:
            location = ""
        # NOTE(review): values are interpolated straight into the SQL text;
        # a quote in any scraped field will break the statement.
        sql = "insert jianshe_renyuans_yejis(rid, xiangmu_bianhao, xiangmu_mingcheng, xiangmu_shudi, xiangmu_leibie, jianshe_danwei) values(%d, '%s', '%s',  '%s', '%s', '%s')" % (
            person_id,
            str(text(record, 'PRJNUM')),
            str(text(record, 'PRJNAME')),
            str(location),
            str(text(record, 'PRJTYPENUM')),
            str(text(record, 'BUILDCORPNAME')),
        )
        if execSql(sql):
            inserted = True
    return inserted


def getRegCert(staffId, rid, tid):
    '''
    Fetch a person's practice registration and store its first entry.

    Only the first element of ``regCertList`` is used. Fields missing from it
    are stored as empty strings. On a successful insert the ``renyuan_zizhi``
    flag is set through ``updateCompany``.

    :param staffId: person id as expected by the ``staffId`` query parameter.
    :param rid: numeric person id used as the database key.
    :param tid: numeric enterprise id whose flag is updated.
    :return: ``True`` if the row was inserted, ``False`` if the request gave
        nothing or the insert failed, ``None`` if the response has no
        registration entry.
    '''
    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId))
    print("Obtain occupation registration information~")
    if payload:
        try:
            regCert = payload['data']['regCertList'][0]
        except Exception:
            return

        def pick(key):
            # Missing or unreadable fields are stored as "".
            try:
                return regCert[key]
            except Exception:
                return ""

        try:
            valid_until = tsToDate(regCert['REG_EDATE'])
        except Exception:
            valid_until = ""

        staff = [
            rid,
            pick('REG_TYPE_NAME'),
            pick('REG_PROF_NAME'),
            pick('QY_NAME'),
            pick('REG_CERTNO'),
            pick('CERT_REG_NO'),
            valid_until,
        ]
        print("Personnel occupation registration information:%s ~" % str(staff))
        # NOTE(review): values are interpolated straight into the SQL text.
        values = (int(staff[0]),) + tuple(str(item) for item in staff[1:])
        statement = "insert jianshe_renyuans_zhiyes(rid, zhuce_leibie, zhuce_zhuanye, zhuce_danwei, zhengshu_bianhao, zhiye_yinzhanghao, youxiaoqi) values(%d, '%s', '%s', '%s', '%s', '%s', '%s')" % values
        if execSql(statement):
            updateCompany(tid, "renyuan_zizhi")
            return True
    print("No occupation registration information~")
    return False


def getStaff(staffId, rid, tid):
    '''
    Fetch a person's basic details and store them in ``jianshe_renyuans_basics``.

    Fields missing from ``staffMap`` are stored as empty strings. On a
    successful insert the ``renyuan_jiben`` flag is set through
    ``updateCompany``.

    :param staffId: person id as expected by the ``staffId`` query parameter.
    :param rid: numeric person id used as the database key.
    :param tid: numeric enterprise id whose flag is updated.
    :return: ``True`` if the row was inserted, ``False`` if the request gave
        nothing or the insert failed, ``None`` if the response has no
        ``staffMap``.
    '''
    payload = getHtml("http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId))
    print("Get basic information of personnel~")
    if payload:
        try:
            staffMap = payload['data']['staffMap']
        except Exception:
            return

        def pick(key):
            # Missing or unreadable fields are stored as "".
            try:
                return staffMap[key]
            except Exception:
                return ""

        staff = [
            rid,
            pick('RY_NAME'),
            pick('RY_SEX_NAME'),
            pick('RY_CARDTYPE_NAME'),
            pick('IDCARD'),
        ]
        print("Basic information of personnel: %s ~" % str(staff))
        # NOTE(review): values are interpolated straight into the SQL text.
        values = (int(staff[0]),) + tuple(staff[1:])
        statement = "insert jianshe_renyuans_basics(rid, xingming, xingbie, zhengjian_leixing, zhengjian_haoma) values(%d, '%s', '%s', '%s', '%s')" % values
        if execSql(statement):
            updateCompany(tid, "renyuan_jiben")
            return True
    print("No basic information of personnel~")
    return False


def getRandCompanyId():
    """Return the ``QY_ID`` of the first company on listing page 0.

    :return: that id, or ``None`` if the listing is empty or unavailable.
    """
    companies = getCompanyList(0)
    try:
        company_id = companies[0]['QY_ID']
    except Exception:
        company_id = None
    return company_id


def qyueryMaxId():
    """Return the ``tid`` of the first row of ``jianshe_qiyes`` ordered by ``tid`` descending.

    The configured ``offset`` and ``limit`` are applied to the query. The
    connection is always closed before returning.

    :return: the ``tid`` value, or ``None`` if there are no rows or the query
        failed.
    """
    conn = None
    try:
        conn = MySQLdb.connect(user=mysql_user, password=mysql_password, database=mysql_database,
                               host=mysql_host, port=int(mysql_port), charset='utf8')
        cursor = conn.cursor()
        cursor.execute("select tid from jianshe_qiyes order by tid desc limit " + offset + "," + limit)
        rows = cursor.fetchall()
        return rows[0][0] if rows else None
    except Exception:
        return None
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Closing is cleanup only; never let it mask the real outcome.
                pass


def getRangeId():
    """Return the ids missing between already-recorded ``tid`` values.

    The ``tid`` values of ``jianshe_xinxi_bulu`` (descending, with the
    configured ``offset``/``limit``) are grouped into clusters: consecutive
    values stay in one cluster while they are at most 100000 apart. Clusters
    with fewer than two values are dropped. Within each cluster, every integer
    strictly between two neighbouring recorded values is returned.

    The process exits with status 0 if the query yields nothing or no usable
    cluster is found.

    :return: list of integer ids not yet present, cluster by cluster in
        ascending order.
    """
    rows = querySql("select tid from jianshe_xinxi_bulu order by tid desc limit " + offset + "," + limit)
    if not rows:
        print("No matching range data found in database~")
        exit(0)

    clusters = []
    current = []
    previous = 0  # 0 means "no value seen yet in the current cluster"
    for row in rows:
        value = int(row[0])
        if previous == 0:
            current.append(value)
            previous = value
            continue
        if abs(value - previous) > 100000:
            # NOTE(review): the value that opens the gap is not carried into
            # the next cluster, so it is dropped — confirm this is intended.
            if len(current) >= 2:
                clusters.append(list(set(current)))
            current = []
            previous = 0
            continue
        current.append(value)
        previous = value
    if len(current) >= 2:
        clusters.append(list(set(current)))

    if not clusters:
        print("No matching range data found in database~")
        exit(0)

    missing = []
    for cluster in clusters:
        cluster.sort()
        for lower, upper in zip(cluster, cluster[1:]):
            if upper - lower > 1:
                missing.extend(range(lower + 1, upper))
    return missing


def addCompany(tid, name):
    """Insert a tracking row for an enterprise with every progress flag set to 0.

    :param tid: numeric enterprise id.
    :param name: enterprise name.
    :return: the result of ``execSql`` for the insert statement.
    """
    template = "insert jianshe_xinxi_bulu(tid, qiye_ming, qiye_jiben, qiye_gongshang, qiye_lianxi, qiye_zizhi, qiye_renyuan, qiye_xiangmu, qiye_anxu, renyuan_jiben, renyuan_zizhi, renyuan_xiangmu, xiangmu_jiben, xiangmu_zhaobiao, xiangmu_shigongtu, xiangmu_hetong, xiangmu_shigongxuke, xiangmu_jungongyashou) values(%d, '%s', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
    statement = template % (int(tid), name)
    return execSql(statement)


def updateCompany(id, con):
    """Set one progress flag to 1 on an enterprise's tracking row.

    :param id: ``tid`` of the row in ``jianshe_xinxi_bulu``.
    :param con: name of the flag column to set.
    :return: the result of ``execSql`` for the update statement.
    """
    statement = "update jianshe_xinxi_bulu set %s=1 where tid = %s" % (str(con), str(id))
    return execSql(statement)


def main():
    """Crawl every missing enterprise id reported by ``getRangeId``.

    Each id is prefixed with ``"00"``, encrypted with ``encryptId`` using the
    id of a currently listed company, and used to fetch the enterprise's basic
    details. When those are stored, its qualification certificates and
    registered personnel are fetched too.
    """
    randtid = getRandCompanyId()
    separator = "================================"
    for plain_id in getRangeId():
        print(separator)
        padded = "00" + str(plain_id)
        QY_ID = encryptId(padded, randtid)
        print("Plaintext id: %s ciphertext id: %s ~" % (padded, QY_ID))
        tid = int(decryptId(QY_ID))
        if getCompanyDetail(QY_ID, tid):
            # Enterprise qualification certificate information
            getCaDetailList(QY_ID, tid)
            # Enterprise personnel information
            getRegStaffList(QY_ID, tid)
        print(separator)


if __name__ == "__main__":
    main()

The code depends on decrypting the API responses, as well as on the daily decryption algorithm for enterprise IDs and personnel IDs. The code here is shared only for learning and discussion; the decryption part will not be released.

I have been busy recently, so I am sharing few development and learning updates. If you have development or learning questions, click here to discuss them with me on QQ.

For WeChat, please scan the QR code below.

Published 51 original articles, won praise 37, visited 40000+
Private letter follow

Posted by Tubby on Wed, 19 Feb 2020 08:16:37 -0800