1. Import the modules
```python
import urllib.request
import re
from bs4 import BeautifulSoup
```
2. Add request headers so the site does not reject the crawler
```python
def qiuShi(url, page):
    ################### Simulate a real browser ##############
    # Set several request header parameters so the crawl looks like a normal browser visit
    heads = {
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }
    headall = []
    for key, value in heads.items():
        items = (key, value)
        # Add the header parameters one by one to the header list
        headall.append(items)
    # print(headall)
    # print('test 1 -- ')
    # Create an opener object
    opener = urllib.request.build_opener()
    # Attach the headers to the opener object
    opener.addheaders = headall
    # Install the opener globally
    urllib.request.install_opener(opener)
    # Fetch the page and read the response into data
    data = opener.open(url).read().decode()
    # data1 = urllib.request.urlopen(url).read().decode('utf-8')
    # print(data1)
    # print('test 2 -- ')
    ################## end ########################################
```
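If you only need the headers for a single request, the same thing can be done without installing a global opener. A minimal sketch of that alternative, reusing the `heads` idea from above (the example URL is just the first page of the site):

```python
import urllib.request

heads = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Pass the headers on a per-request basis via a Request object
# instead of calling urllib.request.install_opener()
req = urllib.request.Request('https://www.qiushibaike.com/8hr/page/1/', headers=heads)
with urllib.request.urlopen(req) as response:
    data = response.read().decode()
```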
3. Create a BeautifulSoup parser object
```python
    soup = BeautifulSoup(data, 'lxml')
    x = 0
```

4. Extract the user names with the BeautifulSoup4 parser

```python
    ############### Get user names ########################
    name = []
    # Use the bs4 parser to extract the user names
    unames = soup.find_all('h2')
    # print('test 3--', unames)
    for uname in unames:
        # print(uname.get_text(), 'page', '-', str(x) + ' user name:', end='')
        # Add the user names one by one to the name list
        name.append(uname.get_text())
    # print(name)
    # print('test 4 -- ')
    ################# end #############################
```

5. Extract the published content
```python
    ############### Published content #####################
    cont = []
    data4 = soup.find_all('div', class_='content')
    # print(data4)
    # Remember to convert the result of the first filter to a string, otherwise the second filter raises an error
    data4 = str(data4)
    # Extract the content with the bs4 parser
    soup3 = BeautifulSoup(data4, 'lxml')
    contents = soup3.find_all('span')
    for content in contents:
        # print('Content of item', x, ':', content.get_text())
        # Add the content pieces one by one to the cont list
        cont.append(content.get_text())
    # print(cont)
    # print('test 5 -- ')
    ############## end ####################################
```
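The str()-and-reparse trick works, but BeautifulSoup can also search inside each matched tag directly, which avoids building a second parser. A minimal sketch of that alternative, assuming `soup` was created as in step 3:

```python
# Alternative: look up the <span> inside each content <div> directly,
# without converting the result list to a string and re-parsing it
cont = []
for div in soup.find_all('div', class_='content'):
    span = div.find('span')  # first <span> inside this content block
    if span is not None:
        cont.append(span.get_text())
```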
6. Extract the funny index

```python
    ################# Funny index ##########################
    happy = []
    # Get the funny index
    # First filter
    data2 = soup.find_all('span', class_="stats-vote")
    # Second filter: convert the list to a string
    data2 = str(data2)
    # print(data2)
    # print('test 6 -- ')
    soup1 = BeautifulSoup(data2, 'lxml')
    happynumbers = soup1.find_all('i', class_="number")
    for happynumber in happynumbers:
        # print(happynumber.get_text())
        # Add the funny numbers one by one to the happy list
        happy.append(happynumber.get_text())
    # print(happy)
    # print('test 7 -- ')
    ################## end #############################
```
7. Extract the number of comments
```python
    ############## Comment number ############################
    comm = []
    data3 = soup.find_all('a', class_='qiushi_comments')
    data3 = str(data3)
    # print(data3)
    soup2 = BeautifulSoup(data3, 'lxml')
    comments = soup2.find_all('i', class_="number")
    for comment in comments:
        # print(comment.get_text())
        # Add the comment counts one by one to the comm list
        comm.append(comment.get_text())
    ############ end #####################################
```
8. Use a regular expression to extract gender and age
```python
    ######## Gender and age ##########################
    # Use a regular expression to match gender and age
    pattern1 = r'<div class="articleGender (\w*?)Icon">(\d*?)</div>'
    sexages = re.compile(pattern1).findall(data)
    # print(sexages)
```
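To see what the two capture groups return, the pattern can be tried on a small HTML fragment; the sample markup below is made up to mirror the page structure, not copied from the site:

```python
import re

pattern1 = r'<div class="articleGender (\w*?)Icon">(\d*?)</div>'

# Hypothetical fragment shaped like the real markup
sample = '<div class="articleGender manIcon">29</div>'
print(re.compile(pattern1).findall(sample))  # [('man', '29')]
```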
9. Set the output format for all user information
```python
    ################## Print all users' information in a batch #################
    print()
    for sexage in sexages:
        sa = sexage
        print('*' * 17, '=_= Page', page, '- user', str(x + 1), '=_=', '*' * 17)
        # Print the user name
        print('[User name]:', name[x], end='')
        # Print gender and age
        print(' [Gender]:', sa[0], ' [Age]:', sa[1])
        # Print the content
        print('[Content]:', cont[x])
        # Print the funny index and comment count
        print('[Funny index]:', happy[x], ' [Comments]:', comm[x])
        print('*' * 25, ' dividing line ', '*' * 25)
        x += 1
    ################### end ##########################
```
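Note that the indexing with `x` assumes `name`, `cont`, `happy`, and `comm` line up one-to-one with `sexages`; if the page layout changes, the lists can drift apart. A quick defensive check (not part of the original script) makes that assumption explicit:

```python
# Hypothetical sanity check: every per-user list should have one entry per matched user
for label, values in (('name', name), ('cont', cont), ('happy', happy), ('comm', comm)):
    if len(values) != len(sexages):
        print('Warning:', label, 'has', len(values), 'entries, but', len(sexages), 'users were matched')
```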
10. Loop over and crawl 13 pages of user information
```python
for i in range(1, 14):
    # Qiushibaike (Embarrassing Encyclopedia) page URL
    url = 'https://www.qiushibaike.com/8hr/page/' + str(i) + '/'
    qiuShi(url, i)
```
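When fetching several pages in a row it is polite to pause briefly between requests; a minimal sketch of that idea (the one-second delay is an arbitrary choice, not something the original script does):

```python
import time

for i in range(1, 14):
    url = 'https://www.qiushibaike.com/8hr/page/' + str(i) + '/'
    qiuShi(url, i)
    # Pause between pages so the crawler does not hammer the site
    time.sleep(1)
```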
Partial screenshot of the run results: