< attacking entomologist > data acquisition of public opinion monitoring

Keywords: Mac OS X

Public opinion monitoring is the practice of tracking and forecasting public opinion on the Internet. Most monitoring technologies are built on web crawlers. If we query a search engine for keywords related to a trending event and save the results locally, we have implemented the first stage of public opinion monitoring: real-time acquisition of Internet data.
Public opinion monitoring.png

Preliminary effect

Get data.gif

Implementation code

import requests
from lxml import etree
import os
import sys

def getData(wd):
    """Search Baidu for ``wd`` and return the result list as plain text.

    Each search result contributes three lines followed by a blank line::

        <Title>  ...
        <content>  ...
        <link>  ...

    Args:
        wd: Search keyword; it is converted with ``str()`` before use.

    Returns:
        The concatenated text of all extracted results, or an empty string
        when no result block is found in the page.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    headers = {
        # Present a desktop-browser User-Agent (disguise the crawler).
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    # Let requests build the query string so that spaces, '&', '#', '+' and
    # non-ASCII keywords are percent-encoded instead of corrupting the URL.
    # The timeout keeps an unresponsive server from blocking forever.
    response = requests.get(
        "https://www.baidu.com/s",
        params={"wd": str(wd)},
        headers=headers,
        timeout=10,
    )
    data_etree = etree.HTML(response.content)
    # NOTE(review): lxml may hand back None for an empty document; treat
    # that the same as "no results" rather than failing on .xpath().
    if data_etree is None:
        return ""

    # One element per organic search result.
    content_list = data_etree.xpath('//div[@id="content_left"]/div[contains(@class, "result c-container")]')

    parts = []
    for content in content_list:
        title = "".join(
            node.xpath('string(.)') for node in content.xpath('.//h3/a')
        )
        abstract = "".join(
            node.xpath('string(.)')
            for node in content.xpath('.//div[@class="c-abstract"]')
        )
        # Not every result carries a display-URL anchor; fall back to an
        # empty link instead of raising IndexError and losing everything.
        links = content.xpath('.//div[@class="f13"]/a[@class="c-showurl"]/@href')
        link = str(links[0]) if links else ""

        parts.extend([
            "<Title>  " + title, "\n",
            "<content>  " + abstract, "\n",
            "<link>  " + link, "\n",
            "\n",
        ])
    return "".join(parts)


# Save as file

def saveDataToFile(file_name, data):
    """Write ``data`` to ``./data/<file_name>.txt``, replacing any old file.

    The ``data`` directory is created in the current working directory when
    it does not exist yet.

    Args:
        file_name: Base name of the output file, without the ``.txt`` suffix.
        data: Text to write.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./data/", exist_ok=True)

    # Scraped text is frequently non-ASCII; pin UTF-8 so the write does not
    # depend on (or fail under) the platform's default locale encoding.
    with open("./data/" + file_name + ".txt", "w", encoding="utf-8") as f:
        f.write(data)

def main():
    """Fetch search results for a keyword, print them and save them to disk.

    The keyword is taken from the first command-line argument; when it is
    missing or empty, ``"Naruto"`` is used instead. The results are saved
    under a file named after the keyword.
    """
    # Only a missing argument is expected here, so test for it explicitly
    # rather than hiding every possible error behind a bare except.
    wd = sys.argv[1] if len(sys.argv) > 1 else ""
    if not wd:
        wd = "Naruto"
    str_data = getData(wd)
    print(str_data)
    saveDataToFile(wd, str_data)

# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Posted by andre&rachel on Sat, 04 Apr 2020 08:56:05 -0700