Python动态网站的抓取

网页下载器

# coding:utf-8
import requests
import urllib2
import sys
type = sys.getfilesystemencoding()
class HtmlDownloader(object):

def download(slef, url):

if url is None:
return None

user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

headers = {'User-Agent': user_agent}
req = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(req)
if response.getcode() == 200:
html = response.read()
return html

return None

网页解析器

# coding:utf-8
import re
import json
class HtmlParser(object):

def parser_url(self, page_url, response):

pattern = re.compile(r'(http://movie.mtime.com/(\d+)/)')
urls = pattern.findall(response)
if urls != None:
# 将urls进行去重
return list(set(urls))
else:
return None

# 解析异步响应值
def parser_json(self, page_url, response):

# 将"="和";"之间的内容提取出来
pattern = re.compile(r'=(.*?);')
result = pattern.findall(response)[0]

if result != None:
value = json.loads(result)
try:
isRelease = value.get('value').get('isRelease')
except Exception, e:
print e
return None
if isRelease:
if value.get('value').get('releaseType') == None:
return self._parser_release(page_url, value)
else:
return self._parser_no_release(page_url, value, isRelease=2)
else:

return self._parser_no_release(page_url, value)

def _parser_release(self, page_url, value):

try:
isRelease = 1
movieRating = value.get('value').get('movieRating')
boxOffice = value.get('value').get('boxOffice')
moveTitle = value.get('value').get('moveTitle')
RPictureFinal = movieRating.get('RPictureFinal')
RStoryFinal = movieRating.get('RStoryFinal')
RDirectorFinal = movieRating.get('RDirectorFinal')
ROtherFinal = movieRating.get('ROtherFinal')
RathingFinal = movieRating.get('RarhingFinal')

MovieId = movieRating.get('MoviedId')
Usercount = movieRating.get('Usercount')
AttitudeCount = movieRating.get('AttitudeCount')

TotalBoxOffice = boxOffice.get('TotalBoxOffice')
TotalBoxOfficeUnit = boxOffice.get('TotalBoxOfficeUnit')
TodayBoxOffice = boxOffice.get('TodayBoxOffice')
TodayBoxOfficeUnit = boxOffice.get('TodayBoxOfficeUnit')

ShowDays = boxOffice.get('ShowDays')

try:

Rank = boxOffice.get('Rank')
except Exception, e:
Rank = 0

return (
MovieId, moveTitle, RathingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal, Usercount,
AttitudeCount
, TotalBoxOffice + TotalBoxOfficeUnit, TodayBoxOffice + TodayBoxOfficeUnit, Rank, ShowDays, isRelease)
except Exception, e:
print e, page_url, value

return None

# 解析未上映的电影信息
def _parser_no_release(self, page_url, value, isRelease=0):

try:
movieRating = value.get('value').get('movieRating')
moveTitle = value.get('value').get('movieTitle')
RPictureFinal = movieRating.get('RPictureFinal')
RStoryFinal = movieRating.get('RStoryFinal')
RDirectorFinal = movieRating.get('RDirectorFinal')
ROtherFinal = movieRating.get('ROtherFinal')
RatingFinal = movieRating.get('RatingFinal')

MovieId = movieRating.get('MovieId')
Usercount = movieRating.get('Usercount')
AttitudeCount = movieRating.get('AttitudeCount')

try:

Rank = 0

except Exception, e:
Rank =0
return (
MovieId, moveTitle, RatingFinal, ROtherFinal, RPictureFinal, RDirectorFinal, RStoryFinal,
Usercount,
AttitudeCount
, u'无', u'无', Rank, 0, isRelease)

except Exception, e:

print e, page_url, value

return None

数据存储器

# coding:utf-8
import MySQLdb

class DataOutput(object):

def __init__(self):
self.con =MySQLdb.connect(host='127.0.0.1', user='root', passwd='', db='go',port=3306,charset='utf8')
self.cx = self.con.cursor()
self.create_table('MTime')
self.datas = []

def create_table(self, table_name):

values = "id int(11) not null primary key auto_increment," \
"MovieId int(11),"\
"MovieTitle varchar(40) NOT NULL," \
"RatingFinal double NOT NULL DEFAULT 0.0," \
"ROtherFinal double NOT NULL DEFAULT 0.0," \
"RPictureFinal double NOT NULL DEFAULT 0.0," \
"RDirectorFinal double NOT NULL DEFAULT 0.0," \
"RStoryFinal double NOT NULL DEFAULT 0.0," \
"Usercount int(11) NOT NULL DEFAULT 0," \
"AttitudeCount int(11) NOT NULL DEFAULT 0," \
"TotalBoxOffice varchar(20) NOT NULL," \
"TodayBoxOffice varchar(20) NOT NULL," \
"Rank int(11) NOT NULL DEFAULT 0," \
"ShowDays int(11) NOT NULL DEFAULT 0," \
"isRelease int(11) NOT NULL" \
""
#print 'CREATE TABLE IF NOT EXISTS %s(%s)' % (table_name, values)

self.cx.execute('CREATE TABLE IF NOT EXISTS %s(%s) ENGINE=InnoDB DEFAULT CHARSET=utf8' % (table_name, values))

def store_data(self, data):

if data is None:
return
self.datas.append(data)
if len(self.datas) > 10:
self.output_db('MTime')

def output_db(self, table_name):
for data in self.datas:
self.cx.execute("INSERT INTO MTime (MovieId,MovieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,"
"RStoryFinal,Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,Rank,ShowDays,isRelease) "
"VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",data)

self.datas.remove(data)

self.con.commit()
self.con.close()

def output_end(self):

if len(self.datas) > 0:
self.output_db('MTime')

self.cx.close()

爬虫调度器

# coding:utf-8
from UrlManager import UrlManager
from DataOutput import DataOutput
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
import time
class SpiderMan(object):

def __init__(self):

self.downloader = HtmlDownloader()
self.parser = HtmlParser()
self.output = DataOutput()

def crawl(self,root_url):

content = self.downloader.download(root_url)

urls = self.parser.parser_url(root_url,content)

for url in urls:

try:
t= time.strftime("%Y%m%d%H%M%S3282",time.localtime())
rank_url ="http://service.library.mtime.com/Movie.api?" \
"Ajax_CallBack=true" \
"&Ajax_CallBackType=Mtime.Library.Services" \
"&Ajax_CallBackMethod=GetMovieOverviewRating" \
"&Ajax_CrossDomain=1" \
"&Ajax_RequestUrl=%s" \
"&t=%s" \
"&Ajax_CallBackArgument0=%s" %(url[0],t,url[1])

#print rank_url
#exit()
rank_content = self.downloader.download(rank_url)

data = self.parser.parser_json(rank_url,rank_content)

self.output.store_data(data)
except Exception,e:
print e
self.output.output_end()
print "Crawl finish"

if __name__ == '__main__':

spider = SpiderMan()
spider.crawl('http://theater.mtime.com/China_Beijing/')

posted on 2018-01-31 18:32 paulversion 阅读(281) 评论(0) 编辑收藏举报