Use Python to crawl the complete movie listing of Mouton Video, so that you can watch VIP movies without paying!

Keywords: Python Programming Web Development Django

A crawler, implemented in Python, that collects every movie listed on the site

# -*- coding: utf-8 -*-
import re
import urllib2
from bs4 import BeautifulSoup
import string, time
import pymongo
  
NUM     = 0         # Global counter of movies; incremented by getmovie() as it stores records
m_type  = u''       # Global movie category name; written by gettags(), stored by getmovie() in each record
m_site  = u'qq' # Global site identifier; stored by getmovie() in each record
  
def gethtml(url):
    """Download the page at *url* and return its raw body.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``response.read()`` yields, i.e. the undecoded body.

    Raises:
        Any error raised by ``urllib2.urlopen`` or ``read()`` propagates
        to the caller unchanged.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # The urllib2 response is not a context manager on Python 2, so
        # close it explicitly; otherwise every fetched page leaks a socket.
        response.close()
  
def gettags(html):
    """Extract the movie categories from the category list page.

    Side effect: the module-level ``m_type`` is overwritten with each
    category name in turn, so after the call it holds the last one parsed.

    Args:
        html: Markup of the category list page.

    Returns:
        A dict mapping category name to category URL (both decoded from
        UTF-8). Empty when the category list or its links are not found;
        "Not Find" is printed in that case.
    """
    global m_type
    tags_url = {}

    soup = BeautifulSoup(html)
    # Target markup: <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class' : 'clearfix _group' , 'gname' : 'mi_type'})
    if not tags_all:
        # No category list on the page: report it instead of failing on
        # tags_all[0] with an IndexError.
        print("Not Find")
        return tags_url

    # Target markup, e.g.:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1.html" title="action" tvalue="0">action</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    tags = re.compile(re_tags, re.DOTALL).findall(str(tags_all[0]))
    if not tags:
        # Previously this path fell through to `return tags_url` with the
        # name unbound and raised UnboundLocalError.
        print("Not Find")
        return tags_url

    for tag in tags:
        # Each match is (href, title, tvalue); only href and title are kept.
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag[0].decode('utf-8')
    return tags_url

If you still feel lost in the world of programming, you can join our Python learning group (qun: 784758214) to see how more experienced learners got started and to exchange experience. Materials covering everything from basic Python scripts to web development, crawlers, Django and data mining, and from zero-basics tutorials to real project data, have been sorted out for every Python enthusiast. We also share learning methods and small details that need attention. Click to join us: a gathering place for Python learners.

def get_pages(tag_url):
    """Return the number of listing pages for one category.

    Args:
        tag_url: URL of the category's first listing page.

    Returns:
        The text of the second-to-last pager link (a string) when the
        pager holds more than one link; presumably the final link is a
        "next page" control, which is why it is skipped (confirm against
        the live markup). Otherwise the integer ``1``, including when the
        page has no pager at all. Callers should pass the result through
        ``int()``.
    """
    soup = BeautifulSoup(gethtml(tag_url))
    # Target markup: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})
    if not div_page:
        # A category without a pager is treated as a single page instead
        # of failing on div_page[0] with an IndexError.
        return 1

    # Target markup, e.g.:
    # <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    pages = re.compile(re_pages, re.DOTALL).findall(str(div_page[0]))
    if len(pages) > 1:
        return pages[-2]
    return 1
      
  
def getmovielist(html):
    """Hand every poster list found in *html* to ``getmovie``.

    Each ``<ul class="mod_list_pic_130">`` element is serialised back to
    a string, stripped of newlines, and passed on for parsing and storage.

    Args:
        html: Markup of one category listing page.
    """
    poster_lists = BeautifulSoup(html).find_all('ul', {'class' : 'mod_list_pic_130'})
    for poster_list in poster_lists:
        # Newlines are removed before the markup reaches getmovie().
        flattened = str(poster_list).replace('\n', '')
        getmovie(flattened)

def getmovie(html):
    """Parse movie posters out of *html* and store one record per movie.

    Every match becomes a document with the keys ``movie_title``,
    ``movie_url``, ``movie_site`` and ``movie_type`` (the latter two taken
    from the module-level ``m_site`` and ``m_type``). It is inserted into
    the ``playlinks`` collection of the ``dianying`` database on
    ``localhost:27017``. The module-level ``NUM`` counter is advanced
    once per stored movie.

    Args:
        html: Markup of a poster list with newlines already removed.

    Returns:
        None. Nothing is stored or printed when no poster matches.
    """
    global NUM

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    movies = re.compile(re_movie, re.DOTALL).findall(html)
    if not movies:
        return

    conn = pymongo.Connection('localhost', 27017)
    try:
        playlinks = conn.dianying.playlinks
        for movie_url, movie_title in movies:
            # Count each movie exactly once; the earlier version bumped
            # NUM (and printed the header) twice per iteration, doubling
            # the reported total.
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title=movie_title,
                movie_url=movie_url,
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # This function opens a fresh connection on every call; release
        # it so that crawling many pages does not accumulate sockets.
        conn.close()
  
def getmovieinfo(url):
    """Fetch a movie page and extract its play links.

    Args:
        url: Address of the movie page.

    Returns:
        A list of ``(href, title)`` tuples matched inside the first
        ``<div class="pack pack_album album_cover">`` element. The list
        is empty, and "Not find movie info" is printed, when the div is
        missing or holds no matching link.
    """
    soup = BeautifulSoup(gethtml(url))
    divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})

    m_info = []
    if divs:
        # Target markup, e.g.:
        # <a href="http://www.tudou.com/albumplay/....html" target="new" title="..." wl="1"> </a>
        re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
        m_info = re.compile(re_info, re.DOTALL).findall(str(divs[0]))

    if not m_info:
        # Covers both "no album div" (formerly an IndexError on divs[0])
        # and "div present but no link matched".
        print("Not find movie info")
    return m_info
  
  
def insertdb(movieinfo):
    """Insert *movieinfo* into the ``movies`` collection of ``dianying_at``.

    Uses the module-level ``conn`` as the database connection.

    NOTE(review): no code visible in this file assigns ``conn`` at module
    level, so calling this as-is would raise NameError. Confirm where the
    connection is meant to be created.

    Args:
        movieinfo: Document (or documents) handed to ``insert``.
    """
    conn.dianying_at.movies.insert(movieinfo)
  
if __name__ == "__main__":
    # Crawl every category: read the category list, then walk each
    # category's listing pages and store the movies found on them.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tag_urls = gettags(gethtml(tags_url))

    for tag_name, tag_url in tag_urls.items():
        # getmovie() stamps each record with the module-level m_type.
        # Without this assignment it keeps the last category parsed by
        # gettags(), so every movie would be filed under the same type.
        m_type = tag_name

        # The URLs are plain ASCII, so str() is sufficient; the extra
        # .encode('utf-8') calls that used to follow were no-ops on it.
        tag_url = str(tag_url)
        print(tag_url)
        maxpage = int(get_pages(tag_url))
        print(maxpage)

        # Listing pages look like
        # http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        # Strip the page suffix once per category and re-append it with
        # each page index in the loop below.
        m_url = tag_url.replace('0_20_0_-1_0.html', '')
        for x in range(maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            getmovielist(gethtml(movie_url))
            # Short pause between requests to avoid hammering the site.
            time.sleep(0.1)
Python resource-sharing group (qun: 784758214): installation packages, PDFs and learning videos are included. This is a gathering place for Python learners; both beginners and advanced learners are welcome.

Posted by eRott on Tue, 01 Oct 2019 02:45:40 -0700