Notes on Defeating a CSS Anti-Crawling Technique (Font Obfuscation)

Keywords: Python, JSON, XML

Target website: Maoyan Movies (maoyan.com)

Main process

  1. Crawl the list pages and collect the url of each movie
  2. Download the page source of each movie
  3. Parse the source and download the obfuscated font file it references
  4. Render each glyph of the font to an image with fontTools
  5. Recognize the digit in each glyph image with a machine-learning model
  6. Replace the obfuscated character references in the source with the recognized digits (a concrete sketch of this step follows the list)
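
To make step 6 concrete, here is a tiny self-contained sketch with a made-up mapping; in the real crawler the mapping comes from the recognition service in step 5:

mapping = {"58169": "0", "58916": "7"}  # decimal glyph code -> recognized digit (made up)
source = '<span class="stonefont">&#58169;.&#58916;</span>'
for code, digit in mapping.items():
    source = source.replace(f"&#{code};", digit)
print(source)  # -> <span class="stonefont">0.7</span>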

Pitfalls encountered

  • pyquery's .text() method automatically unescapes HTML entities, so the obfuscated character references are lost before they can be replaced and only garbled glyphs get printed; lxml had to be used instead (see the sketch after this list)
  • Many posts online suggest converting the woff font file to XML and analyzing the glyf coordinates, but after inspecting several fonts I found that even the x and y values for the same digit are not exactly the same across fonts, so coordinate matching is unreliable
  • I was not yet familiar with multithreading
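
A minimal sketch of the pyquery pitfall, using a made-up obfuscated span of the kind Maoyan serves:

from lxml import etree
from pyquery import PyQuery as pq

snippet = '<span class="stonefont">&#xe339;.&#xe624;</span>'

# pyquery unescapes the references into private-use-area characters,
# so the literal markers are lost and only garbled glyphs remain
print(pq(snippet).text())

# lxml re-serializes non-ASCII text as decimal character references,
# which is exactly what the replacement step relies on
span = etree.HTML(snippet).xpath("//span")[0]
print(etree.tostring(span).decode("utf8"))
# -> <span class="stonefont">&#58169;.&#58916;</span>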

The project consists of the following files:

  • Myspider.py – the main crawler
  • resizeImage.py – resizes the glyph images before recognition
  • SaveMovieData.py – saves the parsed data to MongoDB
  • woffToPng.py – renders each glyph of a woff font to a PNG
  • ThreadClass.py – a small threading.Thread wrapper
  • somexpaths.json – the XPath expressions used for parsing

The code for each file follows.

Myspider.py
import requests
import fake_useragent
import re
import os
import shutil
import json
import time
from woffToPng import woff_to_image
from resizeImage import resize_img
from lxml import etree
from html import unescape
from ThreadClass import SpiderThread
from SaveMovieData import SaveInfo
from pyquery import PyQuery as pq


# Coroutines proved awkward here, so multithreading is used for now

# Collected movie urls
film_urls = []


def verify_img(img_dir):
    # Local digit-recognition HTTP service; it accepts an uploaded image
    # and returns JSON like {"value": "<digit>"}
    api_url = "http://127.0.0.1:6000/b"
    img_to_num = {}
    for file in os.listdir(img_dir):
        file_name = os.path.join(img_dir, file)
        # Normalize the image size before recognition
        resize_img(file_name, file_name)
        files = {"image_file": ("image_file", open(file_name, "rb"), "application")}
        r = requests.post(url=api_url, files=files, timeout=None)
        if r.status_code == 200:
            # Glyph file names look like "uniE339.png": strip the "uni" prefix
            # and convert the hex code point to decimal so it matches lxml's
            # decimal character references
            num_id = os.path.splitext(file)[0][3:]
            img_to_num[str(int(num_id, 16))] = r.json().get("value")
    return img_to_num


def find_certain_part(html, xpath_format):
    try:
        return html.xpath(xpath_format)[0]
    except Exception:
        return "null"

def parse_data_by_lxml(source_code, img_to_num, saver):
    html = etree.HTML(source_code)

    with open("somexpaths.json", "r") as f:
        xpaths = json.load(f)

    movie_name = find_certain_part(html, xpaths.get("movie_name"))
    movie_ename = find_certain_part(html, xpaths.get("movie_ename"))
    movie_classes = find_certain_part(html, xpaths.get("movie_classes")).strip()
    movie_length = find_certain_part(html, xpaths.get("movie_length")).strip()
    movie_showtime = find_certain_part(html, xpaths.get("movie_showtime")).strip()


    text_pattern = re.compile('.*?class="stonefont">(.*?)</span>')
    data_to_be_replace = []

    movie_score = find_certain_part(html, xpaths.get("movie_score"))
    movie_score_num = find_certain_part(html, xpaths.get("movie_score_num"))
    if movie_score != "null":
        movie_score = text_pattern.search(etree.tostring(movie_score).decode("utf8")).group(1)
    if movie_score_num != "null":
        movie_score_num = text_pattern.search(etree.tostring(movie_score_num).decode("utf8")).group(1)

    data_to_be_replace.append(movie_score)
    data_to_be_replace.append(movie_score_num)

    movie_box = find_certain_part(html, xpaths.get("movie_box"))
    if movie_box != "null":
        movie_box = text_pattern.search(etree.tostring(movie_box).decode("utf8")).group(1)
    movie_box_unit = find_certain_part(html, xpaths.get("movie_box_unit"))

    data_to_be_replace.append(movie_box)


    # Check if it is a string
    for item in data_to_be_replace:
        assert isinstance(item, str)
    # Replace each decimal character reference (e.g. "&#58169;") with the recognized digit
    for key, value in img_to_num.items():
        new_key = f"&#{key};"
        for i in range(len(data_to_be_replace)):
            if data_to_be_replace[i] == "null":
                continue
            if new_key in data_to_be_replace[i]:
                data_to_be_replace[i] = data_to_be_replace[i].replace(new_key, value)

    movie_score, movie_score_num, movie_box = [unescape(item) for item in data_to_be_replace]
    # If there is no score, it is regarded as 0
    if movie_score == "null":
        movie_score = "0"
    if movie_box != "null":
        movie_box = movie_box + movie_box_unit.strip()

    movie_brief_info = find_certain_part(html, xpaths.get("movie_brief_info"))
    assert(isinstance(movie_brief_info, str))
    # The logic here is slightly fragile: the first name in the list is assumed to be the director
    movie_director, *movie_actors = [item.strip() for item in html.xpath("//body//div[@id='app']//div//div//div//div[@class='tab-content-container']//div//div[@class='mod-content']//div//div//ul//li//div//a/text()")]
    movie_actors = ",".join(movie_actors)

    movie_comments = {}
    try:
        names = html.xpath("//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div//div[@class='user']//span[@class='name']/text()")
        comments = html.xpath("//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div[@class='main']//div[@class='comment-content']/text()")
        assert(len(names) == len(comments))
        for name, comment in zip(names, comments):
            movie_comments[name] = comment
    except Exception:
        pass

    save_id = saver.insert_dict({
        "name": movie_name,
        "alias": movie_ename,
        "category": movie_classes,
        "duration": movie_length,
        "release_time": movie_showtime,
        "score": float(movie_score),
        "score_num": movie_score_num,
        "box_office": movie_box,
        "brief_introduction": movie_brief_info,
        "director": movie_director,
        "performers": movie_actors,
        "hot_reviews": movie_comments
    })
    print(f"{save_id} saved successfully")

# Crawl one movie's source code; after fetching it, download and process the font file, then replace the obfuscated text
def get_one_film(url, ua, film_id, saver):
    headers = {
        "User-Agent": ua,
        "Host": "maoyan.com"
    }
    r = requests.get(url=url, headers=headers)
    if r.status_code == 200:
        source_code = r.text
        font_pattern = re.compile(r"url\('(.*?\.woff)'\)")
        font_url = "http:" + font_pattern.search(source_code).group(1).strip()
        del headers["Host"]
        res = requests.get(url=font_url, headers=headers)
        # Download fonts and identify them
        if res.status_code == 200:
            if os.path.exists(film_id):
                shutil.rmtree(film_id)  # remove leftovers from a previous run
            os.makedirs(film_id)
            woff_path = os.path.join(film_id, "temp.woff")
            img_dir = os.path.join(film_id, "images")
            os.makedirs(img_dir)
            with open(woff_path, "wb") as f:
                f.write(res.content)
            woff_to_image(woff_path, img_dir)
            # TODO: try coroutine-based recognition later; recognize directly
            # for now. The result is a dict mapping glyph code to digit
            img_to_num = verify_img(img_dir)
            # Delete the temporary files
            shutil.rmtree(film_id)
            # Further processing of obtained data and replaceable information
            parse_data_by_lxml(source_code, img_to_num, saver)

def get_urls(url, ua, showType, offset):

    base_url = "https://maoyan.com"
    headers = {
        "User-Agent": ua,
        "Host": "maoyan.com"
    }

    params = {
        "showType": showType,
        "offset": offset
    }

    urls = []
    r = requests.get(url=url, headers=headers, params=params)
    if r.status_code == 200:
        doc = pq(r.text)
        for re_url in doc("#app div div[class='movies-list'] dl dd div[class='movie-item'] a[target='_blank']").items():
            urls.append(base_url + re_url.attr("href"))
    film_urls.extend(urls)
    print(f"Current capture url{len(film_urls)}individual")


if __name__ == "__main__":
    # test
    ua = fake_useragent.UserAgent()
    tasks_one = []
    try:
        for i in range(68):
            tasks_one.append(SpiderThread(get_urls, args=("https://maoyan.com/films", ua.random, "3", str(30*i))))
        for task in tasks_one:
            task.start()
        for task in tasks_one:
            task.join()
    except Exception as e:
        print(e.args)
    saver = SaveInfo()
    film_ids = [url.split("/")[-1] for url in film_urls]
    print(f"Capture movies url common{len(film_urls)}strip")
    tasks_two = []
    count = 0
    try:
        for film_url, film_id in zip(film_urls, film_ids):
            tasks_two.append(SpiderThread(get_one_film, args=(film_url, ua.random, film_id, saver)))
        for task in tasks_two:
            task.start()
            count += 1
            # Pause after every four threads started to avoid hammering the site
            if count % 4 == 0:
                time.sleep(5)
        for task in tasks_two:
            task.join()
    except Exception as e:
        print(e.args)
    print("After grabbing")
resizeImage.py
from PIL import Image
import os

def resize_img(img_path, write_path):
    crop_size = (120, 200)
    img = Image.open(img_path)
    new_img = img.resize(crop_size, Image.LANCZOS)  # LANCZOS replaces the ANTIALIAS alias removed in newer Pillow
    new_img.save(write_path, quality=100)

if __name__ == "__main__":
    for root, dirs, files in os.walk("verify_images"):
        for file in files:
            img_path = os.path.join(root, file)
            write_path = os.path.join("resized_images", file)
            resize_img(img_path, write_path)
SaveMovieData.py
import pymongo


class SaveInfo:

    def __init__(self, host="localhost", port=27017, db="MovieSpider",
                 collection="maoyan"):
        self._client = pymongo.MongoClient(host=host, port=port)
        self._db = self._client[db]
        self._collection = self._db[collection]

    def insert_dict(self, data: dict):
        result = self._collection.insert_one(data)
        return result.inserted_id
woffToPng.py
from __future__ import print_function, division, absolute_import
import os
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing


class ReportLabPen(BasePen):
    """A pen for drawing onto a reportlab.graphics.shapes.Path object."""

    def __init__(self, glyphSet, path=None):
        BasePen.__init__(self, glyphSet)
        if path is None:
            path = Path()
        self.path = path

    def _moveTo(self, p):
        (x, y) = p
        self.path.moveTo(x, y)

    def _lineTo(self, p):
        (x, y) = p
        self.path.lineTo(x, y)

    def _curveToOne(self, p1, p2, p3):
        (x1, y1) = p1
        (x2, y2) = p2
        (x3, y3) = p3
        self.path.curveTo(x1, y1, x2, y2, x3, y3)

    def _closePath(self):
        self.path.closePath()


def woff_to_image(fontName, imagePath, fmt="png"):
    font = TTFont(fontName)
    gs = font.getGlyphSet()
    glyphNames = font.getGlyphNames()
    for i in glyphNames:
        if i == 'x' or i == "glyph00000":  # skip the non-digit glyphs ('x' and glyph00000)
            continue

        g = gs[i]
        pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=5))
        g.draw(pen)
        w, h = 600, 1000
        g = Group(pen.path)
        g.translate(0, 200)

        d = Drawing(w, h)
        d.add(g)
        imageFile = os.path.join(imagePath, i + "." + fmt)
        renderPM.drawToFile(d, imageFile, fmt)
ThreadClass.py
import threading


class SpiderThread(threading.Thread):

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args

    def run(self) -> None:
        self.result = self.func(*self.args)

    # Joining right after start() to fetch the result would serialize the
    # threads, which is equivalent to not using multithreading at all
    # def get_result(self):
    #     threading.Thread.join(self)
    #     try:
    #         return self.result
    #     except Exception as e:
    #         print(e.args)
    #         return None
somexpaths.json
{
  "movie_name": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//h3/text()",
  "movie_ename": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//div/text()",
  "movie_classes": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[1]/text()",
  "movie_length": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[2]/text()",
  "movie_showtime": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[3]/text()",
  "movie_score": "//body//div[@class='banner']//div//div[@class='movie-stats-container']//div//span[@class='index-left info-num ']//span",
  "movie_score_num": "//body//div[@class='banner']//div//div[@class='movie-stats-container']//div//span[@class='score-num']//span",
  "movie_box": "//body//div[@class='wrapper clearfix']//div//div//div//div[@class='movie-index-content box']//span[@class='stonefont']",
  "movie_box_unit": "//body//div[@class='wrapper clearfix']//div//div//div//div[@class='movie-index-content box']//span[@class='unit']/text()",
  "movie_brief_info": "//body//div[@class='container']//div//div//div//div[@class='tab-content-container']//div//div//div[@class='mod-content']//span[@class='dra']/text()",
  "movie_director_and_actress": "//body//div[@id='app']//div//div//div//div[@class='tab-content-container']//div//div[@class='mod-content']//div//div//ul//li//div//a/text()",
  "commenter_names": "//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div//div[@class='user']//span[@class='name']/text()",
  "commenter_comment": "//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div[@class='main']//div[@class='comment-content']/text()"
}
