A web crawler takes three steps:
1. Get the HTML of the web page you want to crawl
2. Parse the HTML to extract the information you want
3. Download the text and image data locally, or store it in a database
Environment: MySQL + Python 2.7 (32-bit) + VS Code
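Before looking at the individual files, here is a minimal sketch of those three steps. The URL, headers, and output file are placeholders, not part of the project:

```python
# -*- coding:utf-8 -*-
# Minimal sketch of the three crawler steps; URL and file name are placeholders.
import requests
from bs4 import BeautifulSoup

# 1. Request the HTML of the page
resp = requests.get('http://example.com', headers={'User-Agent': 'Mozilla/5.0'})

# 2. Parse the HTML and pull out what we want (here, just the <title> text)
soup = BeautifulSoup(resp.content, 'html.parser')
title = soup.find('title').string

# 3. Save the result locally (a database insert would go here instead)
with open('result.txt', 'w') as f:
    f.write(title.encode('UTF-8'))
print 'saved:', title
```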
html_utils.py file (request and parse)
```python
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import other_utils as otherUtil
import sys

reload(sys)
sys.setdefaultencoding('UTF-8')


# Get the HTML of the whole page
def download_page_html(url, headers):
    data = requests.get(url, headers=headers)
    return data


# Parse the HTML and pull out the tags we need
def lagou_parse_page_html(data):
    soup = BeautifulSoup(data.content, "html.parser")
    # Get every <li> tag that holds a job posting
    ul = soup.find_all('li', attrs={'class': 'con_list_item default_list'})
    object_list = []
    for li in ul:
        # Job title (the data-positionname attribute of the <li> tag)
        job_name = li.get('data-positionname')
        # Salary range
        salary_range = li.get('data-salary')
        # Company name
        company_name = li.get('data-company')
        # Company address (text of the <em> tag inside the <li>)
        company_address = li.find('em').string
        # Company type and financing stage (text of the <div class="industry"> tag, trimmed)
        company_type = li.find('div', attrs={'class': 'industry'}).string.strip()
        # Job requirements
        requirment = li.find('div', attrs={'class': 'li_b_l'}).stripped_strings
        job_requirement = ''
        for str in requirment:
            job_requirement = str.strip()
        # Company self-description
        company_desc = li.find('div', attrs={'class': 'li_b_r'}).string[1:-1]
        # Company logo
        company_logo = 'https:' + li.find('img').get('src')
        img_name = li.find('img').get('alt')
        # Download the company logo to the local disk
        otherUtil.download_img(company_logo, img_name)
        dict = {
            'jobName': job_name,
            'salaryRange': salary_range,
            'companyName': company_name,
            'companyAddress': company_address,
            'companyType': company_type,
            'jobRequirement': job_requirement,
            'companyDesc': company_desc,
            'companyLogo': company_logo
        }
        object_list.append(dict)
    # Get the URL of the next page ('javascript:;' on the last page;
    # BeautifulSoup seems to need to start from a parent node here)
    next_page_a = soup.find('div', attrs={'id': 's_position_list'}).find('div', attrs={'class': 'pager_container'}).contents[-2]
    next_page_url = next_page_a.get('href')
    return object_list, next_page_url
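As a rough illustration of how these two functions fit together on their own (index.py below drives the full loop), here is a quick test. The URL and headers are my own placeholders, not the real values from the config file:

```python
# -*- coding:utf-8 -*-
# Hypothetical quick test of html_utils.py; URL and headers are placeholders.
import html_utils as htmlUtil

headers = {
    'User-Agent': 'Mozilla/5.0',          # placeholder browser UA
    'Referer': 'https://www.lagou.com/',  # placeholder; the real site may need more anti-crawl headers
}

data = htmlUtil.download_page_html('https://www.lagou.com/zhaopin/', headers)  # placeholder start page
object_list, next_page_url = htmlUtil.lagou_parse_page_html(data)

print 'jobs on this page:', len(object_list)
if object_list:
    print object_list[0]['jobName'], object_list[0]['salaryRange']
print 'next page:', next_page_url
```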
For a general introduction to the Requests module, see <https://zhuanlan.zhihu.com/p/20410446>; that column has some good crawler tutorials.
Beautiful Soup documentation (Chinese): <https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html>
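To make the BeautifulSoup calls used above a bit more concrete, here is a tiny self-contained example of `find_all` with an `attrs` filter and of `stripped_strings`; the HTML snippet is made up:

```python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup

html = '''
<ul>
  <li class="con_list_item default_list" data-positionname="Python dev" data-salary="10k-20k">
    <div class="li_b_l">  3-5 years  <span> Bachelor </span></div>
  </li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')
for li in soup.find_all('li', attrs={'class': 'con_list_item default_list'}):
    # Attribute values come straight off the tag
    print li.get('data-positionname'), li.get('data-salary')
    # stripped_strings yields every text fragment with the surrounding whitespace trimmed
    for s in li.find('div', attrs={'class': 'li_b_l'}).stripped_strings:
        print s
```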
That completes the page request and parsing; the data can then be saved locally or to a database, depending on your needs.
other_utils.py file (download)
This file also contains code for scraping proxy IPs from <http://www.xicidaili.com/wt>, but I did not end up using it; only the image-download function is used here.
Some sites need stronger counter-measures against their anti-crawling when you crawl them, otherwise your IP is easily blocked; the Douban Top 250 movie list is one example.
```python
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import telnetlib
import codecs
import myconfig as config


# Get proxy IPs from a proxy-list website
def get_ip_list(url, headers):
    proxy_list = []
    data = requests.get(url, headers=headers)
    # Parse with BeautifulSoup
    soup = BeautifulSoup(data.content, "html.parser")
    # The parsing below is tied to the structure of this particular page
    ul_list = soup.find_all('tr', limit=20)
    for i in range(2, len(ul_list)):
        print 'wait for i:' + str(i)
        line = ul_list[i].find_all('td')
        ip = line[1].text
        port = line[2].text
        # Check whether the proxy IP is usable
        flag = check_proxie(ip, port)
        print str(i) + ' is ' + str(flag)
        if flag:
            proxy = get_proxy(ip, port)
            proxy_list.append(proxy)
    return proxy_list


# Verify that a proxy IP is reachable
def check_proxie(ip, port):
    try:
        ip = str(ip)
        port = str(port)
        telnetlib.Telnet(ip, port=port, timeout=200)
    except Exception, e:
        # Print the exception message
        print e.message
        return False
    else:
        return True


# Format the proxy parameters, adding the http/https prefixes
def get_proxy(ip, port):
    aip = ip + ':' + port
    proxy_ip = str('http://' + aip)
    proxy_ips = str('https://' + aip)
    proxy = {"http": proxy_ip, "https": proxy_ips}
    return proxy


# Download the company logo
def download_img(url, img_name):
    try:
        # Local save path
        temp_path = 'D:\\tarinee_study_word\\Python\\spider_project\\images\\' + str(img_name) + '.jpg'
        # Work around the path encoding problem: img_name is Chinese
        save_path = temp_path.decode('UTF-8').encode('GBK')
        r = requests.get(url)
        r.raise_for_status()
        with codecs.open(save_path, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print 'Error Msg:' + str(e)
```
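For completeness, here is roughly how the proxy dictionaries returned by `get_ip_list` / `get_proxy` could be plugged into requests. I did not use this in the project, so treat it as an untested sketch; the proxy-list URL and headers are placeholders:

```python
# -*- coding:utf-8 -*-
# Untested sketch: pick one of the scraped proxies and use it for a request.
import random
import requests
import other_utils as otherUtil

headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder
proxy_list = otherUtil.get_ip_list('http://www.xicidaili.com/wt', headers)

if proxy_list:
    # e.g. {'http': 'http://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}
    proxy = random.choice(proxy_list)
    resp = requests.get('http://example.com', headers=headers, proxies=proxy, timeout=10)
    print resp.status_code
```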
mysqldb_utils.py file (save to the database)
myconfig here is my configuration file; it contains the database account, password, and other connection settings.
```python
#-*- coding:utf-8 -*-
import MySQLdb
import myconfig as config


# Save the scraped data to the database
def insert_data(object_list):
    # Create the connection
    conn = MySQLdb.Connect(
        host=config.HOST,
        port=config.PORT,
        user=config.USER,
        passwd=config.PASSWD,
        db=config.DB,
        charset=config.CHARSET
    )
    # Get a cursor
    cur = conn.cursor()
    # Build the SQL statement
    sql = "insert into t_job(company_logo,company_name,company_address,company_desc,salary_range,job_name,job_requirement,company_type) values('{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}')"
    #str = 'insert into t_job(company_logo,company_name,company_address,company_desc,salary_range,job_name,job_requirement,company_type) values({companyLogo},{companyName},{companyAddress},{companyDesc},{salaryRange},{jobName},{jobRequirement},{companyType})'
    for index in object_list:
        d = list(index.values())
        #skr = sql.format(d[0].encode('GBK'),d[1].encode('GBK'),d[2].encode('GBK'),d[3].encode('GBK'),d[4].encode('GBK'),d[5].encode('GBK'),d[6].encode('GBK'),d[7].encode('GBK'))
        skr = sql.format(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7])
        #print skr
        # Execute the SQL statement
        cur.execute(skr)
    # Commit
    conn.commit()
    print 'insert success......'
    # Close the cursor and connection
    cur.close()
    conn.close()
```
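myconfig.py itself is not shown in this post; based on the names referenced above and in index.py, it presumably looks something like this (every value below is a placeholder):

```python
# -*- coding:utf-8 -*-
# Hypothetical myconfig.py; all values are placeholders.

# Database connection settings used by mysqldb_utils.py
HOST = '127.0.0.1'
PORT = 3306
USER = 'root'
PASSWD = 'your_password'
DB = 'spider'
CHARSET = 'utf8'

# Crawler settings used by index.py and html_utils.py
DOWNLOAD_URL = 'https://www.lagou.com/zhaopin/'   # placeholder start page
HEADERS = {
    'User-Agent': 'Mozilla/5.0',                  # placeholder
    'Referer': 'https://www.lagou.com/',          # placeholder
}
```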
index.py (startup file)
```python
# -*- coding:utf-8 -*-
import myconfig as config
import html_utils as htmlUtil
import mysqldb_utils as dbUtil


def lagou_main():
    url = config.DOWNLOAD_URL
    while(url):
        print url
        # 1. Get the whole HTML page
        html = htmlUtil.download_page_html(url, config.HEADERS)
        # 2. Parse the HTML page to get the data we want
        data, next_page_url = htmlUtil.lagou_parse_page_html(html)
        if not next_page_url == 'javascript:;':
            url = next_page_url
        else:
            url = False
        # 3. Save to the database
        dbUtil.insert_data(data)


if __name__ == '__main__':
    lagou_main()
```
Problems encountered while working with the database:
1. Error installing MySQLdb: `error: Microsoft Visual C++ 9.0 is required (Unable to find vcvarsall.bat)`
Solution:
1. First install the wheel package so that .whl files can be installed:
pip install wheel
2. Download the MySQL-python wheel that matches your Python version from <https://www.lfd.uci.edu/~gohlke/pythonlibs/#mysql-python>, then install it:
pip install MySQL_python-1.2.5-cp27-none-win32.whl
2. execute(sql[, list]) / executemany(sql[, list]): in Python 2.7, when I print a list object or pass a whole list in, the encoding of the data inside the list becomes a problem (taking the items out one at a time is fine). I found no solution, so I could only take the values one by one and splice them into the SQL statement; if you know a fix, please leave a message.
MySQLdb can take a parameter list directly, or use executemany(sql[, list]) to insert multiple rows in one call, but because of the list encoding problem it kept throwing errors for me.
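One thing worth trying instead of splicing values into the SQL string is to let MySQLdb do the parameter substitution itself, which usually sidesteps both the quoting and the encoding problems. This is a sketch of that approach, not the code I actually used:

```python
# -*- coding:utf-8 -*-
# Sketch: parameterized insert with executemany instead of string splicing.
import MySQLdb
import myconfig as config


def insert_data_many(object_list):
    conn = MySQLdb.Connect(host=config.HOST, port=config.PORT, user=config.USER,
                           passwd=config.PASSWD, db=config.DB, charset=config.CHARSET)
    cur = conn.cursor()
    sql = ("insert into t_job(company_logo,company_name,company_address,company_desc,"
           "salary_range,job_name,job_requirement,company_type) "
           "values(%s,%s,%s,%s,%s,%s,%s,%s)")
    # Build the rows by key, so the column order never depends on dict ordering
    rows = [(o['companyLogo'], o['companyName'], o['companyAddress'], o['companyDesc'],
             o['salaryRange'], o['jobName'], o['jobRequirement'], o['companyType'])
            for o in object_list]
    cur.executemany(sql, rows)
    conn.commit()
    cur.close()
    conn.close()
```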
Summary
Writing the Python code itself was not a problem, but Python's character-encoding issues still trip me up. I have read other people's explanations, yet I only half understand them; I probably need more hands-on practice while writing code for it to really sink in.