Using Python to crawl all videos from Tencent Video — including listings for member-only movies, free of charge!

Keywords: Python

A Python crawler that captures all Tencent Video listings

# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import string, time
import pymongo
  
NUM     = 0         #Global variables, number of movies
m_type  = u''       #Global variable, movie type
m_site  = u'qq' #Global variables, movie sites
  
#Get web page content according to the specified URL
def gethtml(url):
    """Fetch *url* over HTTP and return the raw response body as a byte string.

    Propagates urllib2.URLError / HTTPError on network failure.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # BUGFIX: the original leaked the connection; always close it.
        response.close()
    return html
  
#Get movie categories from the movie category list page
def gettags(html):
    global m_type
    soup = BeautifulSoup(html)      #Filter out classification content
    #print soup
    #<ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class' : 'clearfix _group' , 'gname' : 'mi_type'})
    #print len(tags_all), tags_all
    #print str(tags_all[1]).replace('\n', '')
  
    #< a ﹐ hot = "tag. Sub" class = "﹐ gtag ﹐ hotkey" href = "http://v.qq.com/list/1 ﹐ 0-1 ﹐ 1 ﹐ 0 ﹐ 20 ﹐ 0-1 ﹐ 0. HTML" title = "action" tvalue = "0" > action</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)
  
    tags = p.findall(str(tags_all[0]))
    if tags:
        tags_url = {}
        #print tags
        for tag in tags:
            tag_url = tag[0].decode('utf-8')
            #print tag_url
            m_type = tag[1].decode('utf-8')
            tags_url[m_type] = tag_url
              
    else:
            print "Not Find"
    return tags_url

#Get the number of pages per category
def get_pages(tag_url):
    """Return the number of listing pages for one category.

    The pager's second-to-last <span> carries the highest page number (the
    last one is the "next" arrow). Returns 1 when there is no pager or only
    a single entry. Note: may return a str (regex capture) or the int 1;
    the caller wraps the result in int().
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)      # filter the pager markup out of the page
    #<div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})
    if not div_page:
        # BUGFIX: the original indexed div_page[0] unconditionally and
        # crashed with IndexError on single-page categories with no pager.
        return 1

    #<a class="c_txt6" href="..." title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        return pages[-2]
    else:
        return 1
      
  
def getmovielist(html):
    """Pull every movie-list <ul> out of a listing page and feed each one
    to getmovie() for per-entry parsing."""
    soup = BeautifulSoup(html)
    #<ul class="mod_list_pic_130">
    list_blocks = soup.find_all('ul', {'class' : 'mod_list_pic_130'})
    for block in list_blocks:
        # Flatten the markup onto one line so getmovie's regex can match.
        getmovie(str(block).replace('\n', ''))

def getmovie(html):
    global NUM
    global m_type
    global m_site
  
    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        conn = pymongo.Connection('localhost', 27017)
        movie_db = conn.dianying
        playlinks = movie_db.playlinks
        #print movies
        for movie in movies:
            #print movie
            NUM += 1
            print "%s : %d" % ("=" * 70, NUM)
            values = dict(
                movie_title = movie[1],
                movie_url   = movie[0],
                movie_site      = m_site,
                movie_type      = m_type
                )
            print values
            playlinks.insert(values)
            print "_" * 70
            NUM += 1
            print "%s : %d" % ("=" * 70, NUM)
  
    #else:
    #   print "Not Find"
  
def getmovieinfo(url):
    html = gethtml(url)
    soup = BeautifulSoup(html)
  
    #pack pack_album album_cover
    divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})
    #print divs[0]
  
    #< a href = "http://www.tudou.com/albumlay/9nyofxc_lhi/32jqhikjyki. HTML" target = "new" title = "" blood drop "exclusive documentary" WL = "1" ></a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if m_info:
        return m_info
    else:
        print "Not find movie info"
  
    return m_info
  
  
def insertdb(movieinfo):
    # Insert one movie-info document into MongoDB
    # (database "dianying_at", collection "movies").
    # NOTE(review): `conn` is declared global but never assigned anywhere
    # visible in this file (the __main__ block declares it and never binds
    # it) -- calling this function as-is would raise NameError. Confirm
    # where conn is supposed to be created.
    global conn
    movie_db = conn.dianying_at
    movies = movie_db.movies
    movies.insert(movieinfo)
  
if __name__ == "__main__":
    global conn
  
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    #print tags_url
    tags_html = gethtml(tags_url)
    #print tags_html
    tag_urls = gettags(tags_html)
    #print tag_urls
  
  
    for url in tag_urls.items():
        print  str(url[1]).encode('utf-8') #,url[0]
        maxpage = int(get_pages(str(url[1]).encode('utf-8')))
        print maxpage
  
        for x in range(0, maxpage):
            #http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
            m_url = str(url[1]).replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print movie_url
            movie_html = gethtml(movie_url.encode('utf-8'))
            #print movie_html
            getmovielist(movie_html)
            time.sleep(0.1)

Finally, if you want to learn Python and web crawling together, you can join my Python group [784758214], where installation packages and learning videos are shared for free. Members exchange learning methods, point out small details worth noting, and discuss real project cases regularly. After all, what matters most when looking for a job is hands-on project experience.

Posted by Jack McSlay on Thu, 05 Dec 2019 06:47:05 -0800