Visit the liepin.com homepage to collect the six top-level category pages.
Enter each of the six categories to collect the listing page for every position.
Visit each position's listing page and scrape the first job posting (to keep all postings, or just the first few, change how the regex matches are consumed; see the sketch after this list).
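For example, to store every posting on a page rather than only the first, iterate over all regex matches. A minimal sketch of that change, assuming the same pattern, html, and cuncu names defined in the full script below (this fragment drops into pipei in place of the result[0] handling):

    result = re.findall(pattern, html)
    if result:
        for row in result:   # one tuple of fields per job posting
            cuncu(row)
    else:
        print("null")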
The scraped records are stored in MongoDB.
If MongoDB is not configured or unavailable, you do not need that function:
the commented-out lines in the script save the same data to a CSV file instead (a standalone version follows this note).
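A minimal standalone version of that CSV fallback, assembled from the commented-out lines in the script (the data.csv file name and gb18030 encoding are taken from the original):

    import csv

    def save_csv(row):
        # Append one record to data.csv; gb18030 handles Chinese text on Windows.
        with open("data.csv", "a", encoding="gb18030", newline="") as csvfile:
            csv.writer(csvfile).writerow(row)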
import requests
import re
import time
import csv
import pymongo
from requests.exceptions import RequestException
from lxml import etree


def pipei(url):  # "match": scrape the first job posting from a listing page
    html = gethtml(url)
    if html is None:
        return
    pattern = re.compile(
        '<ul class="sojob-list">.*?li>.*?<div class="sojob-item-main clearfix'
        '.*?div class="job-info".*?h3 title="(.*?)">.*?class="text-warning">(.*?)</span>'
        '.*?area.*?>(.*?)</.*?span class="edu">(.*?)</span>.*?an>(.*?)</span>'
        '.*?<div class="company-info nohover">.*?class="company-name">.*?a.*?>(.*?)</a>'
        '.*?<p class="field-financing">.*?an.*?a class="industry-link" href=.*?>(.*?)</a>',
        re.S)
    result = re.findall(pattern, html)
    if len(result) > 0:  # some positions have no postings at all
        cuncu(result[0])
        # with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
        #     writer = csv.writer(csvfile)
        #     writer.writerow(result[0])  # position information
        print(result[0])
    else:
        # with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
        #     writer = csv.writer(csvfile)
        #     writer.writerow(["null"])
        print("null")


def gethtml(url):  # fetch a page, retrying on network errors
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return gethtml(url)


def one_page(url):  # get the sub-category (position) pages inside one top-level category
    page = gethtml(url)
    if page is None:
        return
    html = etree.HTML(page)
    result = html.xpath('//li/dl/dd/a[contains(@target,"_blank") and @rel="nofollow"]/@href')
    for i in range(len(result)):
        urll = "https://www.liepin.com" + result[i]
        m = re.match(r'(.*?)dqs', urll)  # trim the URL at "dqs" to get the base listing address
        if m:
            pipei(m.group(0))


def shouye(url):  # "homepage": collect the six top-level category pages
    pattern = re.compile('<a .*?target="_blank" href="https://www.liepin.com/(.*?)">')
    page = gethtml(url)
    if page is None:
        return
    six = re.findall(pattern, page)
    for i in six:
        wangzhi = "https://www.liepin.com/" + i
        one_page(wangzhi)


def cuncu(result):  # "store": insert one record into MongoDB (database "test", collection "jobs")
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client.test
    collection = db.jobs
    results = {
        'Position': result[0],
        'salary': result[1],
        'area': result[2],
        'edu': result[3],
        'work time': result[4],
        'company': result[5],
        'company-type': result[6]
    }
    collection.insert_one(results)


def main():
    # with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow(["Position", "salary", "area", "education", "work time", "company", "company type"])
    url = 'https://www.liepin.com/'
    shouye(url)


if __name__ == '__main__':
    main()
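Note that gethtml retries forever by calling itself on every network error, so an unreachable site would never terminate. A bounded variant, as a minimal sketch with a hypothetical max_retries parameter (not part of the original script):

    import time
    import requests
    from requests.exceptions import RequestException

    def gethtml(url, max_retries=3):
        # Try the request a fixed number of times, then give up and return None.
        for _ in range(max_retries):
            try:
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    return response.text
            except RequestException:
                time.sleep(1)  # brief pause before retrying
        return None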