Python crawler learning - crawl the public service platform of construction market supervision
This code is for learning and discussion only; please do not use it for illegal purposes. If it infringes on your rights, please click here to contact the author and have it removed.
This code is for learning and discussion only; please do not use it for illegal purposes. If it infringes on your rights, please click here to contact the author and have it removed.
This code is for learning and discussion only; please do not use it for illegal purposes. If it infringes on your rights, please click here to contact the author and have it removed.
The code depends on decrypting the API responses, and on the encryption/decryption algorithm for the enterprise IDs and personnel IDs, which change every day. The code below is shared for learning and discussion only; the decryption module itself will not be released.
I am busy at the moment, so I will not be posting many development and learning updates for a while. If you have development or learning questions, click here to discuss them with me on QQ.
1. Code implementation
# -*- coding:utf-8 -*-
# Crawler for the jzsc.mohurd.gov.cn web API.
#
# For every enterprise id missing from the `jianshe_xinxi_bulu` table the
# script fetches the enterprise details, its qualification certificates and
# its registered staff, and stores them in MySQL.  Responses are decrypted by
# the project-local `decrypter` module; runtime settings come from config.ini.
import configparser
import hashlib
import random
import re
import sys
import time

import MySQLdb
import requests

from decrypter import decrypt, encryptId, decryptId

CONFIG_FILE = "config.ini"

cf = configparser.ConfigParser()


def _loadConfig():
    """(Re)read config.ini into ``cf``; terminate the program if that fails.

    ``ConfigParser.read`` silently ignores files it cannot open and returns
    the list of files it actually parsed, so a missing file has to be
    detected through the return value rather than through an exception.
    """
    try:
        loaded = cf.read(CONFIG_FILE)
    except configparser.Error as e:
        print("config.ini configuration file could not be parsed: %s" % e)
        sys.exit(0)
    if not loaded:
        print("Program directory does not exist config.ini configuration file~")
        sys.exit(0)


_loadConfig()


def getConf(sec, key):
    """Return option ``key`` of section ``sec`` from config.ini.

    :param sec: section name
    :param key: option name
    :return: the option value as a string
    Prints a message and terminates the program when the option is missing.
    """
    try:
        return cf.get(sec, key)
    except configparser.Error:
        print("The following configuration is not available:" + sec + " - " + key)
        sys.exit(0)


# -------------------------------------------------
offset = str(getConf("app-sys", "offset"))
limit = str(getConf("app-sys", "limit"))
# Database account
mysql_user = getConf("Mysql-Database", "user")
# Database password
mysql_password = getConf("Mysql-Database", "password")
# Database name
mysql_database = getConf("Mysql-Database", "database")
mysql_host = getConf("Mysql-Database", "host")
mysql_port = getConf("Mysql-Database", "port")
# token
token = getConf("web-param", "token")
# Bounds of the random pause after each request; divided by 1000 before
# being passed to time.sleep(), i.e. configured in milliseconds.
min_sleep = int(getConf("app-sys", "min_sleep"))
max_sleep = int(getConf("app-sys", "max_sleep"))
# Per-request timeout (seconds) and number of attempts per URL.
timeout = 20
retry = 3


def _buildHeaders(accessToken):
    """Return the HTTP request headers carrying ``accessToken``."""
    return {
        "Referer": "http://jzsc.mohurd.gov.cn/data/company",
        "timeout": "30000",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0",
        "accessToken": accessToken
    }


headers = _buildHeaders(token)


def getSleepTime():
    """Return a random pause in seconds between min_sleep and max_sleep ms."""
    # Tolerate the two bounds being swapped in config.ini.
    low, high = sorted((min_sleep, max_sleep))
    return float(random.randint(low, high) / 1000)


def _connect():
    """Open a new MySQL connection using the settings from config.ini."""
    return MySQLdb.connect(user=mysql_user, password=mysql_password,
                           host=mysql_host, port=int(mysql_port),
                           database=mysql_database, charset='utf8')


def execSql(sql, params=None):
    """Execute one data-modifying statement and commit it.

    :param sql: SQL text; use ``%s`` placeholders together with ``params``
    :param params: optional sequence of values bound to the placeholders
    :return: True on success, False on any failure (the error is printed)
    """
    conn = None
    try:
        conn = _connect()
        cursor = conn.cursor()
        cursor.execute(sql, params)
        conn.commit()
        return True
    except Exception as e:
        # Best effort by design: callers only look at the boolean result.
        print("SQL execution failed: %s" % e)
        return False
    finally:
        if conn is not None:
            conn.close()


def querySql(sql, params=None):
    """Run a query and return all rows.

    :param sql: SQL text; use ``%s`` placeholders together with ``params``
    :param params: optional sequence of values bound to the placeholders
    :return: the fetched rows, or None on any failure (the error is printed)
    """
    conn = None
    try:
        conn = _connect()
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return cursor.fetchall()
    except Exception as e:
        print("SQL query failed: %s" % e)
        return None
    finally:
        if conn is not None:
            conn.close()


def getHash(s):
    """Return the hexadecimal MD5 digest of the UTF-8 encoding of ``s``."""
    m = hashlib.md5()
    m.update(s.encode("utf-8"))
    return m.hexdigest()


def updateToken(url):
    """Block until config.ini contains a token the API accepts.

    Every five seconds the configuration is re-read, the global ``token`` and
    ``headers`` are rebuilt, and ``url`` is requested again; the function
    returns once the decrypted response carries code "200".

    :param url: URL used to probe the token
    """
    global token
    global headers
    while True:
        _loadConfig()
        token = getConf("web-param", "token")
        print("token Invalid, please replace~\n at present token: %s" % token)
        time.sleep(5)
        headers = _buildHeaders(token)
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            content = decrypt(resp.content.decode("utf-8"))
            if str(content['code']) == "200":
                print("token Update success~")
                return
        except Exception as e:
            # decrypt() is opaque, so any failure just means "try again".
            print("token check failed: %s" % e)


def getHtml(url):
    """Fetch ``url`` and return the decrypted response.

    Up to ``retry`` attempts are made.  A response with code "408" triggers
    ``updateToken`` and another attempt.

    :param url: API URL
    :return: the decrypted response object, or None when all attempts failed
    """
    for attempt in range(retry):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            content = decrypt(resp.content.decode("utf-8"))
            code = str(content['code'])
        except Exception as e:
            print("Request failed (attempt %d of %d): %s" % (attempt + 1, retry, e))
            continue
        if code == "408":
            updateToken(url)
            continue
        time.sleep(getSleepTime())
        return content
    return None


def getCompanyList(page):
    """Return the enterprise list of result page ``page``, or None."""
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?pg=" + str(page) + "&pgsz=15&total=450"
    html = getHtml(url)
    if not html:
        return None
    try:
        return html['data']['list']
    except (KeyError, TypeError):
        return None


_AREA_PATTERN = re.compile("(.*)-(.*)")


def getAreas(s):
    """Split ``s`` at its last hyphen.

    :param s: text of the form "<left>-<right>"
    :return: tuple ``(left, right)``, or None when ``s`` has no hyphen or is
        not a string
    """
    try:
        return _AREA_PATTERN.findall(s)[0]
    except (IndexError, TypeError):
        return None


def tsToDate(ts):
    """Format a millisecond timestamp as a local ``YYYY-MM-DD`` date string.

    :param ts: timestamp in milliseconds (number or numeric string)
    :raises ValueError, TypeError: when ``ts`` is not numeric
    """
    return time.strftime("%Y-%m-%d", time.localtime(float(ts) / 1000))


def _field(record, key, default=""):
    """Return ``record[key]``, or ``default`` when it is missing or None."""
    try:
        value = record[key]
    except (KeyError, IndexError, TypeError):
        return default
    return default if value is None else value


def _dateField(record, key):
    """Return ``record[key]`` formatted by tsToDate, or "" when unusable."""
    try:
        return tsToDate(record[key])
    except (KeyError, IndexError, TypeError, ValueError, OverflowError, OSError):
        return ""


def getCompanyDetail(id, tid):
    """Fetch one enterprise and store its basic information.

    The enterprise is first registered in ``jianshe_xinxi_bulu`` and then
    inserted into ``jianshe_qiyes``.

    :param id: encrypted enterprise id used in the API URL
    :param tid: numeric enterprise id used as database key
    :return: True when both inserts succeeded, otherwise False
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId=" + id
    html = getHtml(url)
    print("Access to basic information of the enterprise~")
    try:
        compMap = html['data']['compMap']
        name = compMap['QY_NAME']
        tid = int(tid)
    except (KeyError, TypeError, ValueError):
        print("The enterprise does not exist~")
        return False

    print("Company name:%s " % name)
    if not addCompany(tid, name):
        return False

    # The region is stored as two columns; both stay empty when the value
    # is missing or contains no hyphen.
    region = getAreas(_field(compMap, 'QY_REGION_NAME')) or ("", "")
    company = [
        tid,
        _field(compMap, 'QY_ORG_CODE'),
        _field(compMap, 'QY_FR_NAME'),
        region[0],
        region[1],
        _field(compMap, 'QY_ADDR'),
        _field(compMap, 'QY_NAME'),
        _field(compMap, 'QY_GSZCLX_NAME'),
    ]
    print("Basic information of the enterprise:%s ~" % str(company))
    sql = ("insert jianshe_qiyes(tid, hao, ren, diqu, city, qiye_dizhi, qiye_ming, qiye_leixing) "
           "values(%s, %s, %s, %s, %s, %s, %s, %s)")
    if execSql(sql, tuple(company)):
        updateCompany(tid, "qiye_jiben")
        return True
    return False


def getCaDetailList(id, tid):
    """Fetch and store the enterprise qualification certificates.

    Only the first result page (15 entries) is requested.  Every entry of
    that page is inserted into ``jianshe_zizhis``.

    :param id: encrypted enterprise id used in the API URL
    :param tid: numeric enterprise id used as database key
    :return: True when at least one row was stored, False when none was,
        None when the response contained no certificate list
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/caDetailList?qyId=" + id + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("Obtain enterprise qualification certificate information~")
    try:
        rows = html['data']['pageList']['list']
    except (KeyError, TypeError):
        rows = None
    if not rows:
        print("No enterprise qualification information~")
        return None

    sql = ("insert jianshe_zizhis(tid, `hash`, hao, ming, leibie, riqi, youxiaoqi, jiguan) "
           "values(%s, %s, %s, %s, %s, %s, %s, %s)")
    storedAny = False
    for row in rows:
        name = _field(row, 'APT_NAME')
        ca = (
            int(tid),
            getHash(str(tid) + str(name)) if name != "" else "",
            _field(row, 'APT_CERTNO'),
            name,
            _field(row, 'APT_TYPE_NAME'),
            _dateField(row, 'APT_GET_DATE'),
            _dateField(row, 'APT_EDATE'),
            _field(row, 'APT_GRANT_UNIT'),
        )
        if execSql(sql, ca):
            storedAny = True
    if storedAny:
        updateCompany(tid, "qiye_zizhi")
    return storedAny


def getRegStaffList(id, tid):
    """Fetch and store the registered staff of an enterprise.

    Only the first result page (15 entries) is requested.  Each person is
    inserted into ``jianshe_qiyes_renyuans``; afterwards the person's basic
    information and registration certificate are fetched as well.

    :param id: encrypted enterprise id used in the API URL
    :param tid: numeric enterprise id used as database key
    :return: None
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/regStaffList?qyId=" + str(id) + "&pg=0&pgsz=15"
    html = getHtml(url)
    print("Get new information of enterprise personnel~")
    try:
        rows = html['data']['pageList']['list']
    except (KeyError, TypeError):
        rows = None
    if not rows:
        print("No enterprise personnel information~")
        return

    sql = ("insert jianshe_qiyes_renyuans(tid, rid, hao, xingming, shenfengzheng, leibie, zhuanye) "
           "values(%s, %s, %s, %s, %s, %s, %s)")
    for row in rows:
        staffId = _field(row, 'RY_ID')
        try:
            rid = int(decryptId(staffId))
        except Exception as e:
            # Without a numeric id the person cannot be keyed in any table;
            # skip this record instead of aborting the whole list.
            print("Skipping staff record with undecodable id: %s" % e)
            continue
        reg = [
            tid,
            rid,
            _field(row, 'REG_SEAL_CODE'),
            _field(row, 'RY_NAME'),
            _field(row, 'IDCARD'),
            _field(row, 'REG_TYPE_NAME'),
            _field(row, 'REG_PROF_NAME'),
        ]
        print("Enterprise personnel information:%s ~" % str(reg))
        if execSql(sql, tuple(reg)):
            updateCompany(tid, "qiye_renyuan")
        # Basic information of personnel
        getStaff(staffId, rid, tid)
        # Personnel practice registration information
        getRegCert(staffId, rid, tid)


def staffPerformanceListSys(id, rid, tid):
    """Fetch and store the project records of one person.

    :param id: encrypted staff id used in the API URL
    :param rid: numeric staff id used as database key
    :param tid: numeric enterprise id (currently unused)
    :return: True when at least one row was stored, False when none was,
        None when the response had no project list
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffPerformanceListSys?staffId=" + str(id) + "&pg=0"
    resp = getHtml(url)
    if not resp:
        return False
    try:
        rows = resp['data']['pageList']['list']
    except (KeyError, TypeError):
        return None
    if not rows:
        return False

    sql = ("insert jianshe_renyuans_yejis(rid, xiangmu_bianhao, xiangmu_mingcheng, xiangmu_shudi, xiangmu_leibie, jianshe_danwei) "
           "values(%s, %s, %s, %s, %s, %s)")
    storedAny = False
    for row in rows:
        try:
            location = row['PROVINCE'] + row['CITY']
        except (KeyError, TypeError):
            location = ""
        per = (
            int(rid),
            _field(row, 'PRJNUM'),
            _field(row, 'PRJNAME'),
            location,
            _field(row, 'PRJTYPENUM'),
            _field(row, 'BUILDCORPNAME'),
        )
        if execSql(sql, per):
            storedAny = True
    return storedAny


def getRegCert(staffId, rid, tid):
    """Fetch and store the first registration certificate of one person.

    :param staffId: encrypted staff id used in the API URL
    :param rid: numeric staff id used as database key
    :param tid: numeric enterprise id whose progress flag is updated
    :return: True when the row was stored, False when the request or the
        insert failed, None when the response held no certificate
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId)
    html = getHtml(url)
    print("Obtain occupation registration information~")
    if html:
        try:
            regCert = html['data']['regCertList'][0]
        except (KeyError, IndexError, TypeError):
            return None
        staff = [
            int(rid),
            _field(regCert, 'REG_TYPE_NAME'),
            _field(regCert, 'REG_PROF_NAME'),
            _field(regCert, 'QY_NAME'),
            _field(regCert, 'REG_CERTNO'),
            _field(regCert, 'CERT_REG_NO'),
            _dateField(regCert, 'REG_EDATE'),
        ]
        print("Personnel occupation registration information:%s ~" % str(staff))
        sql = ("insert jianshe_renyuans_zhiyes(rid, zhuce_leibie, zhuce_zhuanye, zhuce_danwei, zhengshu_bianhao, zhiye_yinzhanghao, youxiaoqi) "
               "values(%s, %s, %s, %s, %s, %s, %s)")
        if execSql(sql, tuple(staff)):
            updateCompany(tid, "renyuan_zizhi")
            return True
    print("No occupation registration information~")
    return False


def getStaff(staffId, rid, tid):
    """Fetch and store the basic information of one person.

    :param staffId: encrypted staff id used in the API URL
    :param rid: numeric staff id used as database key
    :param tid: numeric enterprise id whose progress flag is updated
    :return: True when the row was stored, False when the request or the
        insert failed, None when the response held no staff record
    """
    url = "http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/staff/staffDetail?staffId=" + str(staffId)
    html = getHtml(url)
    print("Get basic information of personnel~")
    if html:
        try:
            staffMap = html['data']['staffMap']
        except (KeyError, TypeError):
            return None
        staff = [
            int(rid),
            _field(staffMap, 'RY_NAME'),
            _field(staffMap, 'RY_SEX_NAME'),
            _field(staffMap, 'RY_CARDTYPE_NAME'),
            _field(staffMap, 'IDCARD'),
        ]
        print("Basic information of personnel: %s ~" % str(staff))
        sql = ("insert jianshe_renyuans_basics(rid, xingming, xingbie, zhengjian_leixing, zhengjian_haoma) "
               "values(%s, %s, %s, %s, %s)")
        if execSql(sql, tuple(staff)):
            updateCompany(tid, "renyuan_jiben")
            return True
    print("No basic information of personnel~")
    return False


def getRandCompanyId():
    """Return the encrypted id of the first listed enterprise, or None."""
    companyList = getCompanyList(0)
    try:
        return companyList[0]['QY_ID']
    except (IndexError, KeyError, TypeError):
        return None


def qyueryMaxId():
    """Return the first ``tid`` of ``jianshe_qiyes`` in descending order.

    The configured ``offset`` and ``limit`` are applied to the query.

    :return: the tid, or None when the query failed or returned no rows
    """
    # offset/limit come from the local config.ini, not from scraped data.
    rows = querySql("select tid from jianshe_qiyes order by tid desc limit " + offset + "," + limit)
    try:
        return rows[0][0]
    except (IndexError, TypeError):
        return None


def _groupNearbyIds(ids, maxGap=100000):
    """Split ``ids`` into runs whose neighbours differ by at most ``maxGap``.

    Runs with fewer than two entries are dropped; each returned run is
    de-duplicated and sorted in ascending order.
    """
    groups = []
    current = []
    for value in ids:
        if current and abs(value - current[-1]) > maxGap:
            if len(current) >= 2:
                groups.append(sorted(set(current)))
            current = []
        # The id that opens a new run belongs to that run.
        current.append(value)
    if len(current) >= 2:
        groups.append(sorted(set(current)))
    return groups


def _missingIds(sortedIds):
    """Return every integer lying strictly between consecutive entries."""
    missing = []
    for low, high in zip(sortedIds, sortedIds[1:]):
        missing.extend(range(low + 1, high))
    return missing


def getRangeId():
    """Return the enterprise ids that are missing from the bookkeeping table.

    The known ids (``jianshe_xinxi_bulu.tid``, limited by the configured
    ``offset``/``limit``) are grouped into runs of nearby values; all ids
    missing inside a run are returned.  Terminates the program when no
    usable data is found.

    :return: list of ids to crawl
    """
    # offset/limit come from the local config.ini, not from scraped data.
    sql = "select tid from jianshe_xinxi_bulu order by tid desc limit " + offset + "," + limit
    res = querySql(sql)
    if not res:
        print("No matching range data found in database~")
        sys.exit(0)

    dateRangeList = _groupNearbyIds(int(r[0]) for r in res)
    if not dateRangeList:
        print("No matching range data found in database~")
        sys.exit(0)

    rangeId = []
    for dateRange in dateRangeList:
        rangeId.extend(_missingIds(dateRange))
    return rangeId


def addCompany(tid, name):
    """Register an enterprise in ``jianshe_xinxi_bulu`` with all flags at 0.

    :param tid: numeric enterprise id
    :param name: enterprise name
    :return: True when the row was inserted, otherwise False
    """
    sql = ("insert jianshe_xinxi_bulu(tid, qiye_ming, qiye_jiben, qiye_gongshang, qiye_lianxi, "
           "qiye_zizhi, qiye_renyuan, qiye_xiangmu, qiye_anxu, renyuan_jiben, renyuan_zizhi, "
           "renyuan_xiangmu, xiangmu_jiben, xiangmu_zhaobiao, xiangmu_shigongtu, xiangmu_hetong, "
           "xiangmu_shigongxuke, xiangmu_jungongyashou) "
           "values(%s, %s, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)")
    return execSql(sql, (int(tid), name))


def updateCompany(id, con):
    """Set progress flag ``con`` to 1 for enterprise ``id``.

    :param id: numeric enterprise id (``tid``)
    :param con: column name of the flag; column names cannot be bound as
        parameters, so this must be a trusted literal from this script
    :return: True when the update succeeded, otherwise False
    """
    sql = "update jianshe_xinxi_bulu set " + str(con) + "=1 where tid = %s"
    return execSql(sql, (id,))


def main():
    """Crawl every enterprise id that is missing from the database."""
    randtid = getRandCompanyId()
    if randtid is None:
        # encryptId() needs a current encrypted id as its second argument.
        print("Could not obtain a sample enterprise id from the company list~")
        return
    rangId = getRangeId()
    for plainId in rangId:
        print("================================")
        QY_ID = encryptId("00" + str(plainId), randtid)
        print("Plaintext id: %s ciphertext id: %s ~" % ("00" + str(plainId), QY_ID))
        tid = int(decryptId(QY_ID))
        companyStatus = getCompanyDetail(QY_ID, tid)
        if companyStatus:
            # Enterprise qualification certificate information
            getCaDetailList(QY_ID, tid)
            # Enterprise personnel information
            getRegStaffList(QY_ID, tid)
        print("================================")


if __name__ == '__main__':
    main()
The code depends on decrypting the API responses, and on the encryption/decryption algorithm for the enterprise IDs and personnel IDs, which change every day. The code above is shared for learning and discussion only; the decryption module itself will not be released.
I am busy at the moment, so I will not be posting many development and learning updates for a while. If you have development or learning questions, click here to discuss them with me on QQ.
Wechat please scan the QR code below
Published 51 original articles, won praise 37, visited 40000+