Python crawler: Beautiful Soup makes it easy to parse the pages you fetch. The example below uses it to scrape job postings from the Boss recruitment site (zhipin.com), including the position name, salary, location, company name, company financing stage, and other information. Reading through this example is a quick way to see how to use BeautifulSoup in practice.
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from middlewares import get_random_proxy, get_random_agent


class Boss_Spider(object):
    """Crawl job listings from zhipin.com and print one line per posting.

    ``Parse_pre`` walks the category menu on the home page and, for every
    category link, calls ``Parse_index`` on each result page.  ``Parse_index``
    extracts the fields of every posting on that page and prints them.
    """

    # Seconds to wait for a response before giving up on a request.
    REQUEST_TIMEOUT = 10

    def __init__(self, page=3):
        """Create a spider.

        Args:
            page: number of result pages to crawl per job category.
        """
        self.proxies = []
        self.verify_pro = []
        self.page = page
        self.headers = {}

    @staticmethod
    def _text(node):
        """Return the text of a BeautifulSoup node, or '' when it is missing."""
        return node.get_text() if node is not None else ''

    def _fetch(self, url):
        """Download ``url`` and return it parsed, or None on any failure.

        A fresh user agent and proxy are requested from ``middlewares`` for
        every call, and the proxy is actually passed to the request.
        """
        headers = get_random_agent()
        # NOTE(review): assumes get_random_proxy() returns a requests-style
        # proxies mapping, as the original passed it via ``proxies=``.
        proxy = get_random_proxy()
        time.sleep(1)  # throttle: one request per second
        try:
            resp = requests.get(
                url,
                headers=headers,
                proxies=proxy,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # A dead proxy or a timeout should skip this page, not abort the crawl.
            print('request failed:', url, exc)
            return None
        if resp.status_code != 200:
            return None
        return BeautifulSoup(resp.text, 'lxml')

    # Step 1: get all recruitment links on the homepage
    def Parse_pre(self):
        """Collect every job-category link on the home page and crawl it."""
        base_url = 'https://www.zhipin.com/'
        soup = self._fetch(base_url)
        if soup is None:
            return
        for job_menu in soup.find_all(class_='menu-sub'):
            for li in job_menu.find_all('li'):
                job_type = self._text(li.find('h4'))
                # href=True skips anchors without a link instead of raising KeyError.
                for job_list in li.find_all('a', href=True):
                    meta = {
                        'job_type': job_type,
                        'job_sub': job_list.get_text(),
                    }
                    # urljoin avoids the '//' that plain concatenation produces
                    # when the href is root-relative.
                    category_url = urljoin(base_url, job_list['href'])
                    # Honour the ``page`` constructor argument.
                    # NOTE(review): page numbering presumably starts at 1 on
                    # the site; the original started at 0 - confirm.
                    for i in range(1, self.page + 1):
                        job_url = category_url + '?page=%d&ka=page-%d' % (i, i)
                        self.Parse_index(meta=meta, url=job_url)

    # Crawling specific page data
    def Parse_index(self, meta, url):
        """Fetch one listing page and print the fields of every posting.

        Args:
            meta: dict with 'job_type' and 'job_sub' for the category being
                crawled (accepted for callers; not used in the output).
            url: listing page URL.
        """
        soup = self._fetch(url)
        if soup is None:
            return
        listing = soup.find(class_='job-list')
        if listing is None:
            # Layout differs from what is expected (e.g. an empty or blocked
            # page): nothing to extract.
            return
        for li in listing.find_all('li'):
            print('###########')
            first_p = li.find('p')
            company = li.find(class_='company-text')
            position = self._text(li.find(class_='job-title'))
            salary = self._text(li.find(class_='red'))
            add = self._text(first_p)
            need = self._text(first_p.find('em')) if first_p is not None else ''
            company_name = self._text(
                company.find('a') if company is not None else None
            )
            # Kept as the element itself (not its text), as in the original.
            tag = company.find('p') if company is not None else None
            print(position, "$$$", salary, "$$$", add, "$$$", need, "$$$",
                  company_name, "$$$", tag)


if __name__ == '__main__':
    b = Boss_Spider()
    b.Parse_pre()
Running the script produces output like the following:
Back end development $$$15-30K $$$3-5 years undergraduate course in Chaoyang District, Beijing
###########
Backend Development Engineer $$$35-55K $$$Wangjing experience in Chaoyang District, Beijing is not limited to undergraduates $$$$$cloud account $$$mobile Internet round C 100-499 people
###########