Course assignment - get all campus news

Keywords: Python encoding PHP codec Database

Assignment requirements:

  1. Extract all the news items from one news list page and wrap the logic in a function.
  2. Get the total number of news articles and calculate the total number of pages.
  3. Fetch the detail page of every news item on every list page.
  4. Pick a topic you are interested in, crawl the data, and perform word-segmentation analysis. The topic must not duplicate another student's.

The code for the first three requirements is as follows:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

# Setting the locale would be needed to parse times in the Chinese format (%Y年%m月%d日),
# which otherwise fails with:
# UnicodeEncodeError: 'locale' codec can't encode character '\u5e74'
# import locale
# locale.setlocale(locale.LC_CTYPE, 'chinese')


def crawlOnePageSchoolNews(page_url):
    res = requests.get(page_url)
    res.encoding = 'UTF-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    news = soup.select('.news-list > li')
    for n in news:
        # print(n)
        print('**' * 5 + 'List page information' + '**' * 10)
        print('News link:' + n.a.attrs['href'])
        print('Headline:' + n.select('.news-list-title')[0].text)
        print('News Description:' + n.a.select('.news-list-description')[0].text)
        print('News time:' + n.a.select('.news-list-info > span')[0].text)
        print('Source:' + n.a.select('.news-list-info > span')[1].text)
        getNewDetail(n.a.attrs['href'])

def getNewDetail(href):
    print('**' * 5 + 'Details page information' + '**' * 10)
    res1 = requests.get(href)
    res1.encoding = 'UTF-8'
    soup1 = BeautifulSoup(res1.text, 'html.parser')
    if soup1.select('.show-info'):  # some older pages have no .show-info block, so guard against it
        news_info = soup1.select('.show-info')[0].text
    else:
        return
    info_list = ['来源', '发布时间', '点击', '作者', '审核', '摄影']  # Fields to be parsed (kept in Chinese so they match the page text): source, release time, clicks, author, reviewer, photographer
    news_info_set = set(news_info.split('\xa0')) - {' ', ''}  # &nbsp; in the page is decoded as \xa0, so \xa0 can be used as the field separator
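    # e.g. a hypothetical show-info text '发布时间:2020-04-02 10:00:00\xa0\xa0作者：XXX\xa0\xa0来源：校园网'
    # would split into the set {'发布时间:2020-04-02 10:00:00', '作者：XXX', '来源：校园网'}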
    # Cycle print article information
    for n_i in news_info_set:
        for info_flag in info_list:
            if n_i.find(info_flag) != -1:  # the release time uses an ASCII colon ':' while the other fields use the full-width '：', so they are parsed separately
                if info_flag == '发布时间':
                    # Convert the publishing time string to datetime format for later storage in the database
                    release_time = datetime.strptime(n_i[n_i.index(':') + 1:], '%Y-%m-%d %H:%M:%S ')
                    print(info_flag + ':', release_time)
                elif info_flag == '点击':  # the click count is injected by JavaScript from a PHP API keyed by the article id, so it is fetched separately
                    getClickCount(href)
                else:
                    print(info_flag + ':' + n_i[n_i.index('：') + 1:])  # the other fields use the full-width colon '：'
    news_content = soup1.select('#content')[0].text
    print(news_content)  # Article content
    print('--' * 40)


def getClickCount(news_url):
    # http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80
    # The API above returns the visit counts for an article, keyed by its id
    click_num_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'
    # Get article id from regular expression
    click_num_url = click_num_url.format(re.search('_(.*)/(.*).html', news_url).group(2))
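    # e.g. a hypothetical URL .../html/2018/xiaoyuanxinwen_0404/9183.html yields group(2) == '9183'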
    res2 = requests.get(click_num_url)
    res2.encoding = 'UTF-8'
    # The response body looks like:
    # $('#todaydowns').html('5');$('#weekdowns').html('106');$('#monthdowns').html('129');$('#hits').html('399');

    # Use a regular expression to pull out the hit count; a non-regex alternative would be
    # res2.text[res2.text.rindex("('") + 2:res2.text.rindex("')")]
    print('click:' + re.search(r"\$\('#hits'\)\.html\('(\d*)'\)", res2.text).group(1))


crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/')

pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
newsSum = int(re.search(r'(\d*)条', soup.select('a.a1')[0].text).group(1))  # the a.a1 pager link text contains the total article count followed by '条'
if newsSum % 10:
    pageSum = int(newsSum/10) + 1
else:
    pageSum = int(newsSum/10)

for i in range(2, pageSum + 1):  # page 1 was already crawled above, so start from page 2
    crawlOnePageSchoolNews(pageURL.format(i))
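
The round-up above is needed because each list page holds 10 news items; as a small sketch using the same newsSum, the page count could equivalently be computed with math.ceil:

import math

pageSum = math.ceil(newsSum / 10)  # 10 news items per list page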

Screenshot of results:


For the fourth requirement, I crawled the descriptions of all the campus news articles, analyzed what the school has done in recent years, where it did it, and what it emphasizes, and counted word frequencies for a word cloud.
The main code is as follows:

import requests
from bs4 import BeautifulSoup
import re
import jieba

editors = []
descriptions = ''


def crawlOnePageSchoolNews(page_url):
    global descriptions
    res0 = requests.get(page_url)
    res0.encoding = 'UTF-8'
    soup0 = BeautifulSoup(res0.text, 'html.parser')
    news = soup0.select('.news-list > li')
    for n in news:
        print('News Description:' + n.a.select('.news-list-description')[0].text)
        print('Source:' + n.a.select('.news-list-info > span')[1].text)
        descriptions = descriptions + ' ' + n.a.select('.news-list-description')[0].text
        editors.append(n.a.select('.news-list-info > span')[1].text.split(' ')[0])


crawlOnePageSchoolNews('http://news.gzcc.cn/html/xiaoyuanxinwen/')

pageURL = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
newsSum = int(re.search(r'(\d*)条', soup.select('a.a1')[0].text).group(1))  # total article count, as in the first script
if newsSum % 10:
    pageSum = int(newsSum / 10) + 1
else:
    pageSum = int(newsSum / 10)

for i in range(2, pageSum+1):
    crawlOnePageSchoolNews(pageURL.format(i))

# Replace punctuation with spaces; each line of punctuation.txt is expected to hold one punctuation character
with open('punctuation.txt', 'r', encoding='UTF-8') as punctuationFile:
    for punctuation in punctuationFile.readlines():
        descriptions = descriptions.replace(punctuation[0], ' ')  # [0] drops the trailing newline
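# A single-pass alternative using str.translate (a sketch, under the same assumption that
# punctuation.txt holds one punctuation character per line):
# table = str.maketrans({line[0]: ' ' for line in open('punctuation.txt', encoding='UTF-8')})
# descriptions = descriptions.translate(table)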

with open('meaningless.txt', 'r', encoding='UTF-8') as meaninglessFile:
    mLessSet = set(meaninglessFile.read().split('\n'))
mLessSet.add(' ')

# Load reserved words
with open('reservedWord.txt', 'r', encoding='UTF-8') as reservedWordFile:
    reservedWordSet = set(reservedWordFile.read().split('\n'))
    for reservedWord in reservedWordSet:
        jieba.add_word(reservedWord)


keywordList = list(jieba.cut(descriptions))
keywordSet = set(keywordList) - mLessSet  # Remove meaningless words from word set

keywordDict = {}
# Count word frequency dictionary
for word in keywordSet:
    keywordDict[word] = keywordList.count(word)

# Sort word frequency
keywordListSorted = list(keywordDict.items())
keywordListSorted.sort(key=lambda e: e[1], reverse=True)
# Write all word frequencies to txt for word cloud analysis
for topWordTup in keywordListSorted:
    print(topWordTup)
    with open('word.txt', 'a+', encoding='UTF-8') as wordFile:
        for i in range(0, topWordTup[1]):
            wordFile.write(topWordTup[0]+'\n')
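
Counting frequencies with list.count rescans the whole keyword list once per distinct word; the same statistics could be produced in a single pass with collections.Counter. A minimal sketch, assuming the keywordList and mLessSet built above:

from collections import Counter

keywordCounter = Counter(w for w in keywordList if w not in mLessSet)
for word, count in keywordCounter.most_common():
    print(word, count)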

Feeding the result of the above processing into https://wordsift.org/ produced the following word cloud:

Some reserved words are not handled well by the segmentation, so some meaningless words were ignored selectively.
The file mentioned above has been uploaded here.
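
As an alternative to wordsift.org, the same frequencies could also be rendered locally with the third-party wordcloud package. A minimal sketch, assuming the keywordDict built above and a hypothetical local font file simhei.ttf that covers Chinese characters:

from wordcloud import WordCloud

wc = WordCloud(font_path='simhei.ttf', width=800, height=600, background_color='white')  # simhei.ttf is a placeholder; any Chinese-capable font works
wc.generate_from_frequencies(keywordDict)
wc.to_file('newsWordCloud.png')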
