Python Crawler - Crawl Stock Information

Keywords: encoding

I recently opened a stock trading account, so I wrote a crawler that collects information on stocks whose codes start with 300 and 600, as a first step toward screening stocks.

The script only crawls the information; it does not sort or analyze it.

Code Address

Included Libraries

import requests
from bs4 import BeautifulSoup
import traceback
import re

Get Web Source Information

def getHTMLText(url, code="utf-8", timeout=30):
    """Download ``url`` and return its body decoded with ``code``.

    Args:
        url: Address of the page to fetch.
        code: Encoding used to decode the response body.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot stall the whole crawl.

    Returns:
        The decoded page text, or "" when the request fails or the server
        answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort crawl: callers treat "" as "page unavailable".
        # Only request failures are caught, so Ctrl-C still stops the run.
        return ""
    r.encoding = code
    return r.text

From the full stock list, collect the codes that start with 3 or 6 (for example the 300xxx and 600xxx series) and add them to the list

def getStockList(lst, stockURL):
    """Append the stock codes linked from the page at ``stockURL`` to ``lst``.

    Every ``<a>`` tag on the page is scanned. For each ``href`` the first
    substring of the form ``sh``/``sz`` + ``3`` or ``6`` + five digits is
    appended. Links without an ``href`` or without such a code are skipped.

    Args:
        lst: List that receives the codes; it is extended in place.
        stockURL: Address of the page listing all stocks.
    """
    html = getHTMLText(stockURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        if href is None:
            continue
        # Explicit checks replace the former bare ``except`` so that real
        # errors (and Ctrl-C) are no longer silently swallowed.
        match = re.search(r"[s][hz][36]\d{5}", href)
        if match:
            lst.append(match.group(0))

Get stock details

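For each listed company, the following fields are crawled and saved to the output file: name, total market value, net assets, net profit, P/E ratio, price-to-book ratio (the "Market Net Rate" column), gross margin and net margin (the "Gross interest rate" and "Net interest rate" columns), and ROE.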

def getStockInfo(lst, stockURL, fpath):
    """Crawl the detail page of every stock in ``lst`` and save one row each.

    The file at ``fpath`` is overwritten. It starts with a header line, then
    one line per stock whose page could be downloaded and parsed. Each cell
    is left-aligned to a width of 10 and followed by a tab. A progress
    percentage is printed to stdout after every stock, whether or not it
    produced a row.

    Args:
        lst: Stock codes to crawl; each page is fetched from
            ``stockURL + code + ".html"``.
        stockURL: Base address of the per-stock detail pages.
        fpath: Path of the UTF-8 text file to write.
    """
    titles = ['Name','Total Market Value','Net assets','Net profit','P/E ratio','Market Net Rate','Gross interest rate','Net interest rate','ROE']
    total = len(lst)
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write("".join("{0:<10}\t".format(title) for title in titles))
        for count, code in enumerate(lst, start=1):
            html = getHTMLText(stockURL + code + ".html", "GB2312")
            row = None
            if html:
                try:
                    soup = BeautifulSoup(html, 'html.parser')
                    table = soup.find('div', attrs={'class': 'cwzb'}).find_all('tbody')[0]
                    row = [table.find_all('b')[0].text]
                    row.extend(cell.text for cell in table.find_all('td')[1:9])
                except (AttributeError, IndexError):
                    # The page lacks the expected "cwzb" table layout, so
                    # skip this stock but keep crawling the rest.
                    row = None
            if row is not None:
                f.write('\n' + "".join("{0:<10}\t".format(cell) for cell in row))
                # Flush per row so partial results stay visible on disk
                # during a long crawl.
                f.flush()
            # Count every stock, including failed downloads, so the
            # progress display actually reaches 100%.
            print("\r Current Progress: {:.2f}%".format(count*100/total),end="")

Main function call

def main():
    """Collect stock codes from quote.eastmoney.com and save their details.

    The codes are gathered from the stock-list page, then each stock's
    detail page is crawled and the results are written to ``./Stock.txt``.
    """
    codes = []
    getStockList(codes, 'http://quote.eastmoney.com/stocklist.html')
    getStockInfo(codes, 'http://quote.eastmoney.com/', './Stock.txt')

Posted by dinosoup on Fri, 03 Apr 2020 12:24:10 -0700