Python crawler in practice, part one: scraping the Encyclopedia of Embarrassing Stories (Qiushibaike)

Keywords: Python Windows less


The regular expression used in the code is explained below.

1) `.*?` is a fixed combination. `.` and `*` together mean that any number of arbitrary characters can be matched; appending `?` switches to non-greedy matching, that is, the match is made as short as possible. We will use `.*?` a lot from here on.

2) `(.*?)` represents a group. In this regular expression we capture five groups. When iterating over the matches later, item[0] is the content captured by the first `(.*?)`, item[1] is the content captured by the second `(.*?)`, and so on.

3) The re.S flag enables "dot matches all" mode, in which the dot `.` can also match a line break.

In this way, we get the publisher, release time, release content, additional pictures and likes.

Note that a story containing pictures is cumbersome to print directly to the console, so here we only keep the stories without pictures.

So we need to filter out the stories that contain pictures.

Looking at the page source, a story with a picture contains the following markup, while a story without a picture does not. item[3] of our regular expression captures exactly this part of the page; if there is no picture, item[3] contains no such markup (it is essentially empty).

<div class="thumb">
<a href="/article/112061287?list=hot&amp;s=4794990" target="_blank">
<img src="(image URL).jpg" alt="but they are still optimistic">

So we only need to check whether item[3] contains an img tag.

The complete code of this article is as follows:

  1 _author__ = 'biao'
  2 # _*_ coding:utf-8 _*_ 
  3 import urllib.request
  4 #import urllib3.request
  5 import re
  6 import _thread
  7 import time
  9 #Embarrassing encyclopedia reptiles
 10 class QSBK:
 11     #Initialization method, defining some variables
 12     def __init__(self):
 13         self.pageIndex = 1
 14         self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
 15         #Initialization headers
 16         self.headers = { 'User-Agent' : self.user_agent }
 17         #The variables that store the segments. Each element is the segments on each page
 18         self.stories = []
 19         #Variables to store whether the program continues to run
 20         self.enable = False
 21     #Get the page code by passing in the index of a page
 22     def getPage(self,pageIndex):
 23         try:
 24             url = '' + str(pageIndex)
 25             #Build requested request
 26             request = urllib.request.Request(url,headers = self.headers)
 27             #utilize urlopen Get page code
 28             response = urllib.request.urlopen(request)
 29             #Convert page to UTF-8 Code
 30             pageCode ='utf-8')
 31             return pageCode
 33         except urllib.error.URLError as e:
 34             if hasattr(e,"reason"):
 35                 print(u"Failed to connect to Encyclopedia of embarrassing events,Error reason",e.reason)
 36                 return None
 37      #Pass in a page code to return the segment sublist without picture on this page
 38     def getPageItems(self,pageIndex):
 39         pageCode = self.getPage(pageIndex)
 40         if not pageCode:
 41             print("Page load failed....")
 42             return None
 43         pattern = re.compile('<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?'+
 44                          'content">(.*?)<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',re.S)
 45         items = re.findall(pattern,pageCode)
 46         #Used to store every page's paragraphs
 47         pageStories = []
 48         #Information of traversal regular expression matching
 49         for item in items:
 50             #Include pictures or not
 51             haveImg ="img",item[3])
 52             #If not, add it list in
 53             if not haveImg:
 54                 replaceBR = re.compile('<br/>')
 55                 text = re.sub(replaceBR,"\n",item[1])
 56                 #item[0]Is the publisher of a paragraph, item[1]It's content. item[2]Is the release time,item[4]It's a point of praise.
 57                 pageStories.append([item[0].strip(),text.strip(),item[2].strip(),item[4].strip()])
 58         return pageStories
 61      #Load and extract the content of the page and add it to the list
 62     def loadPage(self):
 63         #If the number of pages not currently viewed is less than 2, load a new page
 64         if self.enable == True:
 65             if len(self.stories) < 2:
 66                 #Get a new page
 67                 pageStories = self.getPageItems(self.pageIndex)
 68                 #Save the segment of this page to the global list in
 69                 if pageStories:
 70                     self.stories.append(pageStories)
 71                     #Add one to the index of the page number after obtaining, indicating that the next page will be read next time
 72                     self.pageIndex += 1
 74     #Call this method to print one segment each time you hit enter
 75     def getOneStory(self,pageStories,page):
 76         #Segments traversing a page
 77         for story in pageStories:
 78             #Waiting for user input
 79             _input = input()
 80             #Every time you enter a carriage return, determine whether you want to load a new page
 81             self.loadPage()
 82             #If input Q Then the program ends
 83             if _input == "Q":
 84                 self.enable = False
 85                 return
 86             print(u"The first%d page\t Publisher:%s\t Release time:%s\t Fabulous:%s\n%s" %(page,story[0],story[2],story[3],story[1]))
 88     #Starting method
 89     def start(self):
 90         print(u"Reading the Encyclopedia of embarrassing things,Press enter to view the new paragraph, Q Sign out")
 91         #Making variables True,The program can run normally
 92         self.enable = True
 93         #Load one page first
 94         self.loadPage()
 95         #Local variable, control current page read
 96         nowPage = 0
 97         while self.enable:
 98             if len(self.stories)>0:
 99                 #From the overall situation list Get a page's paragraph from
100                 pageStories = self.stories[0]
101                 #Current number of pages read plus one
102                 nowPage += 1
103                 #Global list The first element in is deleted because it has been removed
104                 del self.stories[0]
105                 #Output the segment of this page
106                 self.getOneStory(pageStories,nowPage)
# Start the interactive crawler only when this file is executed as a script,
# so that importing the module does not trigger network access or block on
# console input.
if __name__ == '__main__':
    spider = QSBK()
    spider.start()
113 '''
114 def run_demo():
115     f=urllib.request.urlopen('')
116     print(f.read().decode('utf-8'))
118 if __name__=='__main__':
119     run_demo()
120 '''
122 '''
123 page = 1
124 url = '' + str(page)
125 user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
126 headers = { 'User-Agent' : user_agent }
127 try:
128     request = urllib.request.Request(url,headers = headers)
129     response = urllib.request.urlopen(request)
130     content = response.read().decode('utf-8')
131     pattern = re.compile('<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?'+
132                          'content">(.*?)<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',re.S)
133     items = re.findall(pattern,content)
134     for item in items:
135         haveImg = re.search("img",item[3])
136         if not haveImg:
137             print(item[0],item[1],item[2],item[4])
138    # print(
139 except urllib.error.HTTPError as e:
140     if hasattr(e,"code"):
141         print(e.code)
142     if hasattr(e,"reason"):
143         print(e.reason)
144 '''

Posted by dcace on Tue, 10 Dec 2019 10:24:00 -0800