Crawling recruitment information from liepin.com

Keywords: encoding, MongoDB

Visit the liepin.com homepage to collect the URLs of the six top-level job categories.
Enter each of the six categories to collect the URL of every position under it.
Visit each position's listing page and extract the first recruitment entry (to capture all entries, or just the first few, the regular expression has to be adjusted; see the note after pipei() below).
Each record is stored in MongoDB.
If MongoDB is not configured or not available, you do not need that function: the commented-out lines save the same data to a CSV file instead. A quick connectivity check is sketched just below.
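
If you are not sure whether MongoDB is reachable, here is a quick check to run before crawling (a sketch using pymongo's ping command; the two-second timeout is an arbitrary choice):

import pymongo
from pymongo.errors import ConnectionFailure

client = pymongo.MongoClient(host='localhost', port=27017, serverSelectionTimeoutMS=2000)
try:
    client.admin.command('ping')  # raises if the server cannot be reached
    print("MongoDB is available")
except ConnectionFailure:
    print("MongoDB is unavailable; use the commented-out CSV lines instead")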

import requests
import re
import time
import csv
import pymongo
from requests.exceptions import RequestException
from lxml import etree

def pipei(url):  # extract the job information from one listing page
    html = gethtml(url)
    if html is None:  # the page could not be fetched
        return
    pattern = re.compile('<ul class="sojob-list">.*?li>.*?<div class="sojob-item-main clearfix.*?div class="job-info".*?h3 title="(.*?)">.*?class="text-warning">(.*?)</span>.*?area.*?>(.*?)</.*?span class="edu">(.*?)</span>.*?an>(.*?)</span>.*?<div class="company-info nohover">.*?class="company-name">.*?a.*?>(.*?)</a>.*?<p class="field-financing">.*?an.*?a class="industry-link" href=.*?>(.*?)</a>', re.S)
    result = re.findall(pattern, html)
    if len(result) > 0:  # some positions have no recruitment entries
        cuncu(result[0])
#        with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#            writer = csv.writer(csvfile)
#            writer.writerow(result[0])
#        print(result[0])  # the extracted job information
    else:
#        with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#            writer = csv.writer(csvfile)
#            writer.writerow(["null"])
        print("null")
def gethtml(url, retries=3):  # download a page, retrying a few times on failure
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        if retries > 0:
            return gethtml(url, retries - 1)
        return None
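
# Optional: throttling is gentler on the site; the time module imported above
# could add a pause between requests, e.g. time.sleep(1) before requests.get().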
def one_page(url):  # get the small categories within each big category
    page = gethtml(url)
    if page is None:
        return
    html = etree.HTML(page)
    result = html.xpath('//li/dl/dd/a[contains(@target,"_blank") and @rel="nofollow"]/@href')
    for i in range(len(result)):
        urll = "https://www.liepin.com" + result[i]
        match = re.match(r'(.*?)dqs', urll)  # keep the URL up to and including "dqs"
        if match:
#            print(match.group(0))  # the listing-page URL
            pipei(match.group(0))
def shouye(url):  # collect the six top-level category URLs from the homepage
    html = gethtml(url)
    if html is None:
        return
    pattern = re.compile('<a .*?target="_blank" href="https://www.liepin.com/(.*?)">')
    six = re.findall(pattern, html)
    for i in six:
        wangzhi = "https://www.liepin.com/" + i
        one_page(wangzhi)
def cuncu(result):  # store one extracted record in MongoDB
    client = pymongo.MongoClient(host='localhost', port=27017)
    db = client.test
    collection = db.jobs
    results = {
        'Position': result[0],
        'salary': result[1],
        'area': result[2],
        'edu': result[3],
        'work time': result[4],
        'company': result[5],
        'company-type': result[6]
    }
    collection.insert_one(results)
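
# Note: opening a new MongoClient for every record is wasteful; a single
# module-level client could be reused across calls (a sketch):
#     client = pymongo.MongoClient(host='localhost', port=27017)
#     collection = client.test.jobs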
def main():
#    with open("data.csv", "a", encoding='gb18030', newline='') as csvfile:
#        writer = csv.writer(csvfile)
#        writer.writerow(["Position", "salary", "area", "education", "work time", "company", "company type"])
    url = 'https://www.liepin.com/'
    shouye(url)

if __name__ == '__main__':
    main()
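
To verify that records were stored, you can run a quick query against the same localhost server and the test.jobs collection used above (a sketch):

import pymongo

client = pymongo.MongoClient(host='localhost', port=27017)
for job in client.test.jobs.find().limit(5):
    print(job['Position'], job['salary'])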
