Python: crawling app details and cover images from Google Play


 

#!/usr/bin/env python
#-*- coding: utf-8  -*-
import urllib 
import urllib2 
import random 
import requests
import os,sys 
import MySQLdb
from sgmllib import SGMLParser 
from BeautifulSoup import BeautifulSoup
import re
num=0
# Root of the store; listing pages yield site-relative detail links.
_PLAY_ROOT = "https://play.google.com"
_CATEGORY_URL = _PLAY_ROOT + "/store/apps/category/"

# Directory the cover images are written to (one file per saved cover,
# named after the running ``num`` counter).
_COVER_DIR = "googlemarket"

_CATEGORIES = (
    'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
    'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
    'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
    'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
    'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
    'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
)

_IMAGE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
}

# Patterns are matched against the prettified page text; compiled once
# here instead of on every page.
_DETAIL_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
_TITLE_RE = re.compile(
    r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
_DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
_VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
_PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
_SIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
_OS_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
_DESCRIPTION_RE = re.compile(
    r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
_COVER_RE = re.compile(
    r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')

# Matches the <br>, <br />, <p> and </p> tags themselves.  The previous
# '[<br /> <p> </p>]' was a character class and therefore blanked every
# 'b', 'r', 'p', '/', '<' and '>' character in the description text.
_MARKUP_RE = re.compile(r'<br\s*/?>|</?p>')


def _fetch(url, headers=None):
    """Return the raw body of *url*, or None if the request fails."""
    try:
        return requests.get(url, headers=headers).content
    except requests.exceptions.RequestException as e:
        # One unreachable host must not abort the whole crawl.
        print(e)
        return None


def _fetch_pretty(url):
    """Return the page at *url* re-serialized by BeautifulSoup, or None on failure."""
    body = _fetch(url)
    if body is None:
        return None
    return BeautifulSoup(body).prettify("utf-8")


def _print_first(pattern, page):
    """Print the first capture of *pattern* in *page*; do nothing if it is absent."""
    found = pattern.findall(page)
    if found:
        print(found[0])


def _print_details(page):
    """Print name, developer, version, date, size, OS and description of one app page."""
    # App name.
    for title in _TITLE_RE.findall(page):
        print(title)

    # Developer: the capture spans several prettified lines; the original
    # code read the value from line index 8 of it, which is kept here but
    # guarded (NOTE(review): this index depends on the page layout).
    developer = _DEVELOPER_RE.findall(page)
    if developer:
        lines = developer[0].split("\n")
        if len(lines) > 8:
            print(lines[8])

    # Version, last update, file size, supported OS -- each may be missing.
    for pattern in (_VERSION_RE, _PUBLISHED_RE, _SIZE_RE, _OS_RE):
        _print_first(pattern, page)

    # Description, with paragraph/line-break tags replaced by spaces.
    for description in _DESCRIPTION_RE.findall(page):
        print(_MARKUP_RE.sub(' ', description))


def _save_cover(page):
    """Download the app's cover image and store it as ``googlemarket/<num>``.

    ``num`` is only advanced when an image was actually written, so a page
    without a cover (or a failed download) neither raises nor re-saves the
    previous app's image.
    """
    global num
    covers = _COVER_RE.findall(page)
    if not covers:
        return
    # The capture still carries the attribute's surrounding quotes.  Strip
    # exactly those; the old ``[1:-2]`` slice also cut off the last
    # character of the URL.  As before, the last match is the one saved.
    cover_url = covers[-1].strip('"\'')
    print(cover_url)
    image = _fetch(cover_url, headers=_IMAGE_HEADERS)
    if image is None:
        return
    if not os.path.isdir(_COVER_DIR):
        os.makedirs(_COVER_DIR)
    # Binary mode: the payload is image data, not text.
    with open(os.path.join(_COVER_DIR, str(num)), "wb") as f:
        f.write(image)
    num = num + 1
    print(num)


def main():
    """Crawl every Google Play category listing and each app page linked from it.

    For each app the extracted fields are printed and the cover image is
    saved under ``googlemarket/``.  A page that cannot be fetched is
    reported and skipped.  Exits with status 1 if the MySQL connection
    cannot be opened.
    """
    try:
        conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                               db='googlemarket', charset="utf8")
        conn.query("set names utf8")
    except Exception as e:
        print(e)
        sys.exit(1)

    # NOTE: writing rows to the database was disabled in this revision (the
    # statement was
    #   insert into address(name,version,developer,pubtime,filesize,
    #                       support,introduction) values(%s,...)
    # with its execute/commit commented out).  The connection is still
    # opened so the script keeps failing fast when the database is down.
    try:
        for category in _CATEGORIES:
            listing = _fetch_pretty(_CATEGORY_URL + category)
            if listing is None:
                continue
            # A listing can link the same app more than once.
            for link in set(_DETAIL_LINK_RE.findall(listing)):
                url = _PLAY_ROOT + link
                print(url)
                page = _fetch_pretty(url)
                if page is None:
                    continue
                _print_details(page)
                _save_cover(page)
    finally:
        conn.close()
	
			
		
				
	
    
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    main()


 



<type 'str'>
Traceback (most recent call last):
  File "crawler0729.py", line 103, in <module>
    main()
  File "crawler0729.py", line 91, in main
    temp=requests.get(j[1:-2], headers=headers)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
    return request('get', url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
    r = adapter.send(request, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
    raise ConnectionError(e)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)


 

posted @ 2013-07-31 21:02  javawebsoa  Views(477)  Comments(0)  Edit  收藏  举报