Web Crawler Practice

Keywords: Big Data JSON Python Mac OS X

Today we practice crawling a website and distill a crawling template that can be reused for similar sites.
Let's take a site like http://www.simm.cas.cn/xwzx/kydt/ as an example. The goal is to crawl the title, release time, article link, picture link, and source of each news item.

We mainly use the requests, re, Beautiful Soup, and json modules.
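
Before the full script, here is a minimal sketch of the fetch-and-parse pattern it relies on. The URL and the lefttitle3 link class come from the target page; the trimmed User-Agent string is just for illustration.

#!/usr/bin/env python
# Minimal sketch: fetch the news list page and print each article's title and link.
import requests
from bs4 import BeautifulSoup

url = "http://www.simm.cas.cn/xwzx/kydt/"
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, headers=headers)
soup = BeautifulSoup(resp.content, "lxml")
for a in soup.find_all("a", class_="lefttitle3"):   # each entry on the list page
    print(a.text, url + a["href"][1:])              # title and absolute article URL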
Now on to the full code. Where there are mistakes or things that could be improved, criticism and corrections are welcome.

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import json

baseurl = "http://www.simm.cas.cn/xwzx/kydt/"
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}

def reptile_news(url):
    book = {}
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "lxml")

    book["title"] = soup.find("td",class_="newtitle").text #Get the text title

    html = str(soup.find_all("td",class_ = "hui12_sj2"))
    book["time"] = re.search("Date of publication:</td>, <td[^>]+>(.*)</td>",html).group(1) #Get new time

    content_items = soup.find("div", class_="TRS_Editor").find_all("font")  # The article body sits in <font> tags
    content = ""
    for i in content_items:
        content = content + re.sub("<[^>]+>", "", str(i)) + "<br><br>"  # strip any remaining tags, separate paragraphs
    book["content"] = "".join(content.split())  # remove all whitespace from the body text

    img_url = baseurl + url.split("/")[-2]  # Directory part of the article URL, used as the base for picture links
    try:
        img = img_url + soup.find("div", class_="TRS_Editor").img["src"][1:]
    except (TypeError, AttributeError):  # the article contains no picture
        img = "NULL"
    book["img_url"] = img

    return book

def reptile_list(url):
    lists = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.content, "lxml")
    items = soup.find_all("a", class_="lefttitle3")
    for i in items:
        book = {}  # a fresh dict for each article so the list entries stay independent
        book["news_url"] = baseurl + i["href"][1:]
        book.update(reptile_news(book["news_url"]))
        lists.append(book)
        print(book)
    return lists

if __name__ == "__main__":
    limit = 10  # Crawl 10 pages of the news list
    lists = []
    url = baseurl
    for i in range(0, limit):
        lists.extend(reptile_list(url))  # Collect the articles on this page
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.content, "lxml")
        html_url = str(soup.find_all("a", class_="h12"))
        url = baseurl + re.search("href=([^>]*) id=[^>]+>next page</a>", html_url).group(1)[1:-1]  # Follow the "next page" link
    with open("/Users/caipeng/PycharmProjects/practice/simm_base.json", "w", encoding='utf-8') as f:
        json.dump(lists, f, ensure_ascii=False)

The final result is saved to simm_base.json as a JSON array, one object per news article.
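
To quickly check the saved file, it can be loaded back with the json module; the path simply mirrors the one used above, and "title" and "time" are keys filled in by reptile_news.

import json

with open("/Users/caipeng/PycharmProjects/practice/simm_base.json", encoding="utf-8") as f:
    data = json.load(f)

print(len(data))                           # number of articles crawled
print(data[0]["title"], data[0]["time"])   # first record's title and publication time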
