< attacking entomologist > data acquisition of public opinion monitoring
Keywords:
Mac
OS X
Public opinion monitoring is the practice of tracking and forecasting public opinion and sentiment on the Internet. Most monitoring technologies are built on web crawlers. If we query a search engine for keywords related to trending events and save the results locally, we have implemented the first step of public opinion monitoring: real-time acquisition of Internet data.
Public opinion monitoring.png
Preliminary effect
Implementation code
import requests
from lxml import etree
import os
import sys
def getData(wd):
    """Search Baidu for ``wd`` and return the results as formatted text.

    Sends a GET request to Baidu's search page with a desktop-browser
    User-Agent, parses the returned HTML with lxml, and extracts the title,
    abstract and link of every result container matched by the XPath below.

    Args:
        wd: Search keyword. It is converted with ``str()`` and passed as a
            query parameter so that requests URL-encodes it (spaces, ``&``,
            ``#`` and similar characters no longer corrupt the query).

    Returns:
        A string with one record per result, each laid out as::

            <Title> ...
            <content> ...
            <link> ...
            (blank line)

        An empty string is returned when no result container matches.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {
        # Present the request as desktop Chrome rather than the default
        # python-requests agent.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }

    # Let requests build the query string so the keyword is percent-encoded,
    # and bound the wait so a stalled connection cannot hang the script.
    response = requests.get(
        "https://www.baidu.com/s",
        params={"wd": str(wd)},
        headers=headers,
        timeout=10,
    )

    data_etree = etree.HTML(response.content)
    if data_etree is None:
        # Defensive: nothing parseable came back, so there are no results.
        return ""

    content_list = data_etree.xpath(
        '//div[@id="content_left"]/div[contains(@class, "result c-container")]'
    )

    records = []
    for content in content_list:
        # string(.) flattens nested markup (e.g. <em> highlights) to text.
        title = "".join(
            node.xpath('string(.)') for node in content.xpath('.//h3/a')
        )
        abstract = "".join(
            node.xpath('string(.)')
            for node in content.xpath('.//div[@class="c-abstract"]')
        )
        links = content.xpath(
            './/div[@class="f13"]/a[@class="c-showurl"]/@href'
        )
        # Some result containers carry no c-showurl anchor; emit an empty
        # link for them instead of raising IndexError and losing every result.
        link = str(links[0]) if links else ""

        records.append(
            "<Title> " + title + "\n"
            + "<content> " + abstract + "\n"
            + "<link> " + link + "\n"
            + "\n"
        )

    return "".join(records)
# Save as file
def saveDataToFile(file_name, data):
    """Write ``data`` to ``./data/<file_name>.txt``, replacing any old file.

    The ``./data/`` directory (relative to the current working directory) is
    created when it does not exist yet.

    Args:
        file_name: Base name of the output file, without the ``.txt`` suffix.
        data: Text to write.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("./data/", exist_ok=True)
    # Force UTF-8: the platform default encoding may be unable to represent
    # the non-ASCII text that search results commonly contain.
    with open("./data/" + file_name + ".txt", "w", encoding="utf-8") as f:
        f.write(data)
def main():
    """Run one search from the command line, then print and save the results.

    The keyword is taken from the first command-line argument; when it is
    missing or empty the default keyword "Naruto" is used. The formatted
    results are printed to stdout and written to ``./data/<keyword>.txt``.
    """
    # Check the argument count explicitly rather than hiding every possible
    # error behind a bare ``except``.
    wd = sys.argv[1] if len(sys.argv) > 1 else ""
    if not wd:
        wd = "Naruto"

    str_data = getData(wd)
    print(str_data)
    saveDataToFile(wd, str_data)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Posted by andre&rachel on Sat, 04 Apr 2020 08:56:05 -0700