Python uses the requests module to crawl the latest movie information from Movie Paradise

Keywords: Python, encoding, database, JSON

I have been teaching myself Python for a little more than two months. By chance I got hold of the 14th session of Old Boy's automated-development videos and treasured them like a gem. I like Alex's chicken soup and Wu Sir's seriousness. I haven't finished watching them yet, but I have already gained a lot.

Today I want to use what I have learned to crawl all the latest movie information from Movie Paradise.

requests: fetches the web page content

re: extracts the specific pieces of information you want from the page

BeautifulSoup: makes it easy to search tags and pull out the information you want

threading: multithreading dramatically reduces the crawl time

queue: a thread-safe queue for the lines to be written to a file (in my real project I put all the data into a database)

Those are the modules involved; the short sketch right below shows how requests and BeautifulSoup fit together before the full script.
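As a warm-up, here is a minimal sketch of fetching and parsing the listing page (the URL, the GBK encoding, and the 'html.parser' choice all match the full script below; it is an illustration, not part of the final program):

import requests
from bs4 import BeautifulSoup

resp = requests.get('http://www.ygdy8.net/html/gndy/dyzz/index.html',
                    headers={'User-Agent': 'Mozilla/5.0'})
resp.encoding = 'GBK'  # the site serves GBK-encoded pages
soup = BeautifulSoup(resp.text, 'html.parser')
# each movie entry on the list page is an <a class="ulink"> tag
for a in soup.find_all('a', attrs={'class': 'ulink'}):
    print(a.string, a['href'])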

Suggestions are welcome. (If Alex or Wu Sir happens to see this, I hope they will offer a few pointers.)

# Author : 'n1celll'
import requests
import json
import re
from bs4 import BeautifulSoup
import threading
import queue,time

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'

def get_page(url):
    index = requests.get(url, headers=header)
    index.encoding = 'GBK'  # match the site's GBK encoding
    t = index.text
    index_soup = BeautifulSoup(t, 'html.parser')  # convert the fetched page into a soup object
    all_pages = index_soup.find('select', attrs={'name': 'sldd'}).find_all('option')[-1] # Get the total number of pages
    page = int(all_pages.string)
    return page

def get_data(page):

    page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html' % page  # URL of each list page
    print(page)
    # page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_30.html'
    res = requests.get(page_url, headers=header)
    res.encoding = 'GBK'  # 'gb2312'
    a = res.text
    soup = BeautifulSoup(a, 'html.parser')
    name = soup.find_all('a', attrs={'class': 'ulink'})
    # print(name) test
    for i in name:
        try:
            movie_name = re.search('《(.*?)(》|])', i.string).group()
            # Two pitfalls here: the movie name sits inside 《》 book-title marks, and some entries have two a tags.
        except:
            continue
        html = 'http://www.ygdy8.net' + i['href']
        da = requests.get(html, headers=header)
        da.encoding = 'GBK'  # da.apparent_encoding
        db = da.text
        # f = open('test2.txt','w',encoding='utf8')
        # f.write(a.text)
        # f.close()
        dr = BeautifulSoup(db, 'html.parser')
        span = dr.find('span', attrs={'style': 'FONT-SIZE: 12px'})
        if span:
            dc = span.text.split()

            data = ''.join(dc)  # re-join the fields with the whitespace stripped
            print(data)
            msg = {}
            if data:
                msg['mname'] = movie_name
                try:
                    # the lookbehind alternatives are the Chinese field labels used on the detail pages
                    show_t = re.search(r'(?<=(◎年代|◎时间|品年代|年代】|映时间|上映：))(.*?)(?=◎|年|【)', data).group()
                except:
                    show_t = re.search(r'(?<=日期|部：)(.*?)(?=(-|剧))', data).group()
                msg['mtime'] = show_t
                try:
                    country = re.search(r'(?<=(◎国家|◎产地|◎地区|◎国别|国家】))(.*?)(?=◎|【类)', data).group()
                except:
                    try:
                        country = re.search(r'(?<=地区)(.*?)(?=语言)', data).group()
                    except:
                        country = 'Unknown'
                msg['mcountry'] = country
                try:
                    length = re.search(r'(?<=◎片长|片长】)(.*?)(?=◎|【)', data).group()
                except:
                    length = 'Unknown'
                msg['mlength'] = length  # separate key so the release date in msg['mtime'] is not overwritten
                try:
                    mtype = re.search(
                        r'(?<=(◎类别|他类型|影类型|◎类型|集类型|◎分类|类型：|类别】|片类型|类型:))(.*?)(?=(◎|级|【出品|【主演))',
                        data).group()
                except:
                    try:
                        mtype = re.search(r'(?<=类型：)(.*?)(?=国家)', data).group()
                    except:
                        mtype = re.search(r'动作|爱情|战争', data).group()  # last resort: match any genre word
                # The regular expressions above feel clumsy; I hope you can offer some technical advice (an alternative sketch appears after the script).
                # with open('test4.txt','a+',encoding='utf8') as f:  # test
                #     f.write('%s: %s,%s,%s,%s\n' % (movie_name, country, mtype, length, show_t))  # test
                q.put('%s: %s,%s,%s,%s,%s\n' % (movie_name, country, mtype, length, show_t, html))

q = queue.Queue(maxsize=10000)
t_obj = []
lock = threading.Lock()  # lock to serialize writes to the shared file
# semaphore = threading.BoundedSemaphore(200)
def writing(f):
    # semaphore.acquire()
    data = q.get()
    lock.acquire()
    f.write(data)
    lock.release()
    # semaphore.release()
        # if not q.get():
        #     f.close()
        #     break
    # print('write complete')
all_page = get_page(url)
f = open('test4.txt', 'w', encoding='utf8')
print(all_page + 1)  # debug: upper bound of the page range

for i in range(1,all_page+1):
    t = threading.Thread(target=get_data,args=(i,))
    t.start()
    t_obj.append(t)
for t in t_obj:
    t.join()  # make sure every crawler thread has finished before writing starts
    print('%s over' % t)

while q.qsize():  # drain the queue while it still holds items
    w = threading.Thread(target=writing, args=(f,))
    w.start()
    w.join()
else:
    print('Writing finished')
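Since the lookbehind patterns above are brittle, here is one possible alternative (a sketch, not the original design): split the detail text on the ◎ field marker and match label prefixes instead. The FIELD_PREFIXES mapping below is an assumption; real pages use varying labels, so the prefix lists would need checking against actual detail pages:

# hypothetical helper: map output keys to the label prefixes seen after a '◎' marker
FIELD_PREFIXES = {
    'mtime': ('年代', '时间', '上映日期'),
    'mcountry': ('国家', '产地', '地区'),
    'mlength': ('片长',),
    'mtype': ('类别', '类型', '分类'),
}

def parse_fields(text):
    # split the detail text on the field marker and match each chunk by its label prefix
    msg = {}
    for chunk in text.split('◎'):
        chunk = chunk.strip()
        for key, prefixes in FIELD_PREFIXES.items():
            if key in msg:
                continue  # keep the first match for each field
            for prefix in prefixes:
                if chunk.startswith(prefix):
                    msg[key] = chunk[len(prefix):].strip('：: ')
                    break
    return msg

print(parse_fields('◎年代 2019 ◎产地 中国'))  # {'mtime': '2019', 'mcountry': '中国'}

Each field is then located by its label rather than by a fixed-width lookbehind, so handling a new label variant means adding one string to FIELD_PREFIXES instead of rewriting a regex.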
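A note on the queue-and-lock part: starting a new thread per queued line and joining it immediately writes the file sequentially anyway, and with a single consumer the lock is unnecessary. A common alternative (again just a sketch, not what this post uses) is one writer thread that consumes the queue until it sees a sentinel:

import threading
import queue

q = queue.Queue()
SENTINEL = None  # marks the end of the data

def writer(path):
    # single consumer: the queue itself serializes access, so no lock is required
    with open(path, 'w', encoding='utf8') as f:
        while True:
            item = q.get()
            if item is SENTINEL:
                break
            f.write(item)

w = threading.Thread(target=writer, args=('test4.txt',))
w.start()
q.put('example line\n')  # crawler threads would put their rows here
q.put(SENTINEL)  # signal the writer to finish
w.join()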
