In 10 minutes, we use Python to take the English title of a given movie and scrape its Chinese title and box-office figure from Maoyan ("Cat's Eye") Movies.

Keywords: Python Windows Selenium Programming

&

[root@xxn maoyan]# cat cat.py
#!/usr/bin/env python
#coding:utf-8

import requests
from bs4 import BeautifulSoup

def movieurl(url):
    """
    Resolve a Maoyan search URL to the detail-page URL of its first result.

    :param url: search-results URL (``http://maoyan.com/query?kw=...``).
    :return: absolute ``http://maoyan.com...`` URL built from the ``href`` of
        the first ``channel-detail movie-item-title`` block on the page.
    :raises IndexError: if the page contains no such result block.
    """
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'lxml')
    titles = soup.find_all('div', class_="channel-detail movie-item-title")
    if not titles:
        # Same exception type a bare ``[0]`` would raise, but with a message
        # that tells the user which search came back empty.
        raise IndexError("no movie found on search page: %s" % url)
    return "http://maoyan.com%s" % titles[0].find('a')['href']

def moveinfo(url):
    """
    Fetch a movie detail page and extract its Chinese title and box-office unit.

    :param url: URL of the movie's detail page.
    :return: tuple ``(Chinesename, boxofficeunit)``. ``boxofficeunit`` is the
        text of the ``unit`` span inside the box-office block, or ``0`` when
        that block or span is missing (box office not yet available).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.3964.2 Safari/537.36",
    }
    response = requests.get(url, headers=headers, timeout=5)
    soup = BeautifulSoup(response.text, 'lxml')
    Chinesename = soup.find('div', class_="movie-brief-container").h3.string
    try:
        box = soup.find_all('div', class_="movie-index-content box")[0]
        boxofficeunit = box.find('span', class_='unit').string
    except (IndexError, AttributeError):
        # IndexError: no box-office block; AttributeError: block without a
        # unit span (find() returned None). Anything else is a real error
        # and must not be hidden behind the "no data" sentinel.
        boxofficeunit = 0
    return Chinesename, boxofficeunit

if __name__ == '__main__':
    # Python 2's input() eval()s whatever is typed, so a plain title such as
    # "Iron Man" raised SyntaxError; raw_input() returns the text unchanged.
    Moviename = raw_input("Please enter the English name of the movie:")
    # The search endpoint expects spaces encoded as '+'.
    Moviename = Moviename.replace(' ', '+')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename
    Chinesename, boxofficeunit = moveinfo(movieurl(url))
    # Single parenthesised argument: prints "<name> <unit>" exactly as the
    # two-item print statement did.
    print("%s %s" % (Chinesename, boxofficeunit))

&

[root@xxn maoyan]# cat maoyan.py
#!/usr/bin/env python
# coding=utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import random
from PIL import Image
import pytesseract
import os
import cat

def imagedownlod(url):
    """
    Render ``url`` in PhantomJS and save a screenshot to ``maoyan.png``.

    Only the rendered box-office figure is needed, so image loading is
    disabled to speed up the page load.

    :param url: address of the movie's detail page.
    """
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400'
    ]
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    # Pick a browser header at random from USER_AGENTS to disguise the crawler.
    dcap["phantomjs.page.settings.userAgent"] = random.choice(USER_AGENTS)
    # Without loading pictures, page crawling is much faster.
    dcap["phantomjs.page.settings.loadImages"] = False
    # Start exactly one browser, after all capabilities are set; every
    # PhantomJS instance is a separate process that must be shut down.
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        # Fixed window size so the box-office figure lands at a known position.
        driver.set_window_size(1366, 3245)
        driver.get(url)
        driver.save_screenshot("maoyan.png")
    finally:
        driver.quit()

def crop_image(image_path, crop_path, size=(1366, 3245),
               box=(668, 388, 668 + 158, 388 + 54)):
    """
    Normalise a page screenshot to a fixed size and cut out the box-office area.

    Locating the element through webdriver gave usable positions, but the
    screenshots came out in different sizes, so cropping by element geometry
    was unreliable. Instead every screenshot is resized to one uniform size,
    where the box-office figure sits at a fixed position.

    :param image_path: path of the full-page screenshot.
    :param crop_path: path the cropped region is written to.
    :param size: ``(width, height)`` the screenshot is resized to.
    :param box: ``(left, top, right, bottom)`` crop rectangle, in pixels of
        the resized image.
    """
    # Resize and crop in memory: no intermediate file to create, reopen and
    # delete, and the source file handle is closed by the ``with`` block.
    with Image.open(image_path) as img:
        # LANCZOS is the high-quality filter formerly exposed as ANTIALIAS,
        # a name that newer Pillow releases no longer provide.
        resized = img.resize(size, Image.LANCZOS)
    resized.crop(box).save(crop_path)

def words(image):
    """
    OCR the digits in the image file ``image`` and delete the file afterwards.

    Because screenshots of different sizes are normalised first, pytesseract
    fails to recognise the digits in some of them. The picture is therefore
    converted to grayscale and read with
    ``config="-psm 8 -c tessedit_char_whitelist=1234567890"``.

    :param image: path of the cropped box-office picture; removed on return.
    :return: the text recognised by pytesseract.
    """
    try:
        with Image.open(image) as src:
            gray = src.convert('L')
        # NOTE(review): newer Tesseract releases appear to spell this flag
        # ``--psm``; confirm against the installed version.
        return pytesseract.image_to_string(
            gray, config="-psm 8 -c tessedit_char_whitelist=1234567890")
    finally:
        # Clean up the temporary crop even when opening or OCR fails.
        if os.path.exists(image):
            os.remove(image)

if __name__ == '__main__':
    # Python 2's input() eval()s whatever is typed, so a plain title such as
    # "Iron Man" raised SyntaxError; raw_input() returns the text unchanged.
    Moviename = raw_input("Please enter the English name of the movie:")
    # The search endpoint expects spaces encoded as '+'.
    Moviename = Moviename.replace(' ', '+')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename
    # Resolve the detail page once. This script only prints the OCR result,
    # so the title/unit lookup (cat.moveinfo) is not needed here.
    imagedownlod(cat.movieurl(url))
    try:
        crop_image('maoyan.png', 'piaofang.png')
        print(words('piaofang.png'))
    finally:
        # Remove the screenshot even if cropping or OCR fails.
        os.remove('maoyan.png')

&

[root@xxn maoyan]# cat catseye.py 
#!/usr/bin/env python
# coding=utf-8
import cat
import maoyan
import sys
import os
# Python 2-only idiom (reload() is not a builtin on Python 3): make UTF-8 the
# interpreter-wide default encoding so that implicit str/unicode conversions,
# such as the string concatenations printed in main(), use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')
def main():
    """
    Prompt for an English movie title and print its Chinese title and box office.

    The title is searched on maoyan.com. The Chinese name and the box-office
    unit come from the detail page's HTML; the box-office number is read by
    screenshotting the page and running OCR on the cropped figure. When the
    page has no box-office unit, the screenshot/OCR step is skipped.
    """
    # raw_input, not input: Python 2's input() eval()s the typed text, so an
    # ordinary title would raise SyntaxError/NameError.
    moviename = raw_input("Please enter the English name of the movie:")
    # Chain both substitutions so that neither one discards the other.
    Moviename = moviename.replace(' ', '+').replace(':', '%3A')
    url = "http://maoyan.com/query?kw=%s&type=0" % Moviename
    # Resolve the detail page once and reuse it for both the HTML lookup and
    # the screenshot, instead of repeating the search request.
    detail_url = cat.movieurl(url)
    Chinesename, boxofficeunit = cat.moveinfo(detail_url)
    if boxofficeunit == 0:
        # No unit on the page means the box office is not available yet, so
        # there is no number to screenshot and recognise.
        print("The English name of the movie you searched for:" + moviename)
        print("The Chinese name of the movie you searched for:" + Chinesename)
        print("The box office you are searching for:" + 'No time')
        return
    maoyan.imagedownlod(detail_url)
    try:
        maoyan.crop_image('maoyan.png', 'piaofang.png')
        number = maoyan.words('piaofang.png')
    finally:
        # Remove the screenshot even if cropping or OCR fails.
        os.remove('maoyan.png')
    print("The English name of the movie you searched for:" + moviename)
    print("The Chinese name of the movie you searched for:" + Chinesename)
    print("The box office you are searching for:" + str(number) + str(boxofficeunit))
if __name__ == '__main__':
    main()

If you are still confused in the world of programming, you can join our Python learning QQ group (784758214) to see how our predecessors learned and to exchange experience. I am a senior Python development engineer, covering everything from basic Python scripts to web development, crawlers, Django, data mining, etc. To every Python learner: I share some learning methods and small details that need attention. Click to join us at the Python learners' gathering place.

Test:

Posted by stukov on Sat, 05 Oct 2019 19:51:18 -0700