代码改变世界

初识python之 APP store排行榜 蜘蛛抓取(三) ---多线程版

2015-01-05 01:07  etodream  阅读(588)  评论(0)  编辑  收藏  举报
#coding=utf-8
import time
import urllib2
import Queue
import threading
import xml.dom.minidom
import MySQLdb


def get_appstop_data(url):
    """Download *url* and return the raw response body.

    The request is sent with a fixed browser-like ``User-Agent`` header.

    Args:
        url: Address of the feed to fetch.

    Returns:
        The response body exactly as read from the server.

    Raises:
        urllib2.HTTPError: The server answered with an error status. The
            error body is printed first and the exception is then re-raised,
            because there is no successful response to read from.
    """
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, headers=headers)
    try:
        # Issue the request exactly once. The previous code also called
        # urlopen() outside the try block, which doubled the traffic and let
        # errors escape before the handler could report them.
        response = urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        print(e.fp.read())
        raise
    try:
        return response.read()
    finally:
        # Always release the underlying socket.
        response.close()


def get_attrvalue(node, attrname):
    """Return attribute *attrname* of *node*, or '' when *node* is falsy."""
    return node.getAttribute(attrname) if node else ''

def get_xmlnode(node, name):
    """Return the elements named *name* under *node*; [] when *node* is falsy."""
    if not node:
        return []
    return node.getElementsByTagName(name)

def get_nodevalue(node, index=0):
    """Return the ``nodeValue`` of the child of *node* at position *index*.

    Args:
        node: A DOM node, or a falsy value such as None.
        index: Position of the child to read inside ``node.childNodes``.

    Returns:
        The child's ``nodeValue``. An empty string is returned when *node*
        is falsy or has no child at *index* (for example an empty element
        such as ``<content/>``), so callers get the same "no value" result
        in both cases instead of an IndexError.
    """
    if not node:
        return ''
    try:
        child = node.childNodes[index]
    except IndexError:
        return ''
    return child.nodeValue

def _first_node(parent, tag_name):
    """Return the first element named *tag_name* under *parent*, or None."""
    matches = get_xmlnode(parent, tag_name)
    return matches[0] if matches else None


def _utf8(text):
    """Encode *text* as UTF-8, ignoring unencodable characters; None -> ''."""
    return (text or u'').encode('utf-8', 'ignore')


def get_xml_data(xml_string, table, type, itunesid=0):
    """Parse a feed document into one dict per ``<entry>`` element.

    Args:
        xml_string: XML text of the feed.
        table: Stored unchanged under the ``'table'`` key of every result
            (the worker thread in this file passes the country code).
        type: Stored unchanged under the ``'type'`` key (the worker thread
            passes the numeric feed-type id).
        itunesid: Stored unchanged under the ``'itunesid'`` key (the worker
            thread passes the genre id, or 0 for the overall chart).

    Returns:
        A list of dicts in document order with the keys ``rank`` (1-based
        position of the entry), ``title``, ``name``, ``url``, ``id``,
        ``category_id``, ``category_name``, ``price``, ``logo``, ``table``,
        ``type`` and ``itunesid``. Text values are UTF-8 encoded; a value
        whose element is missing from the entry is an empty string rather
        than an IndexError.
    """
    doc = xml.dom.minidom.parseString(xml_string)
    app_list = []
    for index, entry in enumerate(doc.getElementsByTagName('entry')):
        # <id> carries both the store URL (text) and the numeric id (im:id).
        id_node = _first_node(entry, 'id')
        # <category> carries the genre id (im:id) and its display label.
        category_node = _first_node(entry, 'category')
        app_list.append({
            'rank': index + 1,
            'title': _utf8(get_nodevalue(_first_node(entry, 'title'))),
            'name': _utf8(get_nodevalue(_first_node(entry, 'im:name'))),
            'url': _utf8(get_nodevalue(id_node)),
            'id': _utf8(get_attrvalue(id_node, 'im:id')),
            'category_id': _utf8(get_attrvalue(category_node, 'im:id')),
            'category_name': _utf8(get_attrvalue(category_node, 'label')),
            'price': _utf8(
                get_attrvalue(_first_node(entry, 'im:price'), 'amount')),
            # Only the first <im:image> of the entry is kept.
            'logo': _utf8(get_nodevalue(_first_node(entry, 'im:image'))),
            'table': table,
            'type': type,
            'itunesid': itunesid,
        })
    return app_list


def marge_url():
    """Build the list of feed jobs to crawl.

    Returns:
        A list of ``[url, country, feed_id, genre]`` lists. For every
        country and feed type there is one overall-chart job (``genre`` is
        the integer 0) followed by one job per genre (``genre`` is the genre
        id as a string).
    """
    base = 'http://itunes.apple.com/'
    countries = ('cn', 'tw', 'hk', 'us')
    # (feed name used in the URL, numeric id stored with the results)
    feeds = (
        ('toppaidapplications', 1),
        ('topfreeapplications', 2),
        ('topgrossingapplications', 3),
        ('toppaidipadapplications', 4),
        ('topfreeipadapplications', 5),
        ('topgrossingipadapplications', 6),
    )
    # Genre ids 6000-6018 plus 6020, 6022 and 6023, as strings.
    genres = [str(g) for g in range(6000, 6019)] + ['6020', '6022', '6023']

    jobs = []
    for cc in countries:
        for feed_name, feed_id in feeds:
            prefix = base + cc + '/rss/' + feed_name + '/limit=200/'
            jobs.append([prefix + 'xml', cc, feed_id, 0])
            jobs.extend(
                [prefix + 'genre=' + g + '/xml', cc, feed_id, g]
                for g in genres
            )
    return jobs

def DbSql(sql, type):
    """Run one SQL statement on a fresh MySQL connection.

    Args:
        sql: The statement to execute. A falsy value means "nothing to do".
        type: Selects what is returned: ``'select'`` returns the first row
            of the result (or None when there is none), ``'add'`` returns
            the connection's last insert id as an int, anything else
            returns the string ``'nokey'``.

    Returns:
        The value described under *type*, or None when *sql* is falsy or a
        ``MySQLdb.Error`` occurred. Errors are printed and appended to
        ``PythonLogo.log``; they are not raised.
    """
    if not sql:
        return None

    conn = None
    try:
        conn = MySQLdb.connect(host="主机", user="用户", passwd="密码", db="数据库", charset='utf8')
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            if type == 'select':
                data = cursor.fetchone()
            elif type == 'add':
                data = int(conn.insert_id())
            else:
                data = 'nokey'
        finally:
            cursor.close()
        conn.commit()
        return data
    except MySQLdb.Error as e:
        # Not every MySQLdb error carries a (code, message) pair.
        if len(e.args) >= 2:
            errMsg = "Mysql Error %d: %s" % (e.args[0], e.args[1])
        else:
            errMsg = "Mysql Error: %s" % (e,)
        print(errMsg)
        # Append so that earlier errors are not wiped out by later ones.
        with open('PythonLogo.log', 'a') as log_file:
            log_file.write(errMsg + '\n')
        return None
    finally:
        # Close the connection on the error path as well as on success.
        if conn is not None:
            conn.close()


def addAppRank(app_list, startTime):
    """Store one ranking snapshot for every entry of *app_list*.

    For each entry the app is looked up by its Apple id in
    ``app1_info_<table>`` and inserted when no row is returned, the most
    recent stored rank is read from ``app1_collect_<table>``, and a new
    ranking row is inserted there.

    Args:
        app_list: Dicts as produced by ``get_xml_data``.
        startTime: Value written to ``collect_ctime`` for every new row.

    Notes:
        Text taken from the feed is escaped with ``MySQLdb.escape_string``
        and numeric fields are coerced with ``int`` before being placed in
        SQL. Entries whose numeric fields cannot be parsed, or whose app row
        could not be obtained, are skipped. ``DbSql`` returns None both for
        "no row" and for a database error, so a failed lookup is handled
        like a missing app.
    """
    def esc(value):
        # Feed values arrive as UTF-8 byte strings from get_xml_data.
        return MySQLdb.escape_string(str(value))

    for item in app_list:
        # The table suffix is the country code supplied by the job list,
        # not feed content.
        country = item.get('table')
        try:
            apple_id = int(item.get('id'))
            app_type = int(item.get('type'))
            itunesid = int(item.get('itunesid'))
            rank = int(item.get('rank'))
        except (TypeError, ValueError):
            print('skip entry with invalid numeric field: %r' % (item,))
            continue

        sql = "SELECT app_id FROM app1_info_%s WHERE app_id_apple=%d" % (country, apple_id)
        res = DbSql(sql, 'select')
        if res is None:
            addSql = (
                "INSERT INTO app1_info_%s(`app_id_apple`,`app_type`,`app_name`,"
                "`app_itunesid`,`collect_country`,`app_logo`,`app_price`) "
                "VALUES(%d,'%d','%s','%s','%s','%s','%s')"
            ) % (country, apple_id, app_type, esc(item.get('name')),
                 esc(item.get('category_id')), esc(country),
                 esc(item.get('logo')), esc(item.get('price')))
            key_id = DbSql(addSql, 'add')
            print('insert')
        else:
            print('select')
            key_id = res[0]

        print(key_id)
        if key_id is None:
            # The insert failed (already logged by DbSql); there is no app
            # row to attach a ranking to.
            continue
        key_id = int(key_id)

        # Look up the most recent stored rank; only one row is needed.
        prevSql = (
            "SELECT collect_now_rank FROM app1_collect_%s "
            "WHERE collect_app_id=%d AND collect_country='%s' "
            "AND collect_app_type=%d AND collect_app_itunesid=%d "
            "ORDER BY collect_ctime desc LIMIT 1"
        ) % (country, key_id, esc(country), app_type, itunesid)
        prevRes = DbSql(prevSql, 'select')
        prev_rank = 0 if prevRes is None else prevRes[0]

        collectSql = (
            "INSERT INTO app1_collect_%s(`collect_app_id`,`collect_ctime`,"
            "`collect_now_rank`,`collect_prev_rank`,`collect_app_type`,"
            "`collect_app_itunesid`,`collect_country`) "
            "VALUES(%d,'%s','%d','%s','%d','%d','%s')"
        ) % (country, key_id, esc(startTime), rank, esc(prev_rank),
             app_type, itunesid, esc(country))
        DbSql(collectSql, 'addnokey')

    # Pause for 0.2 seconds after the whole list has been stored.
    time.sleep(0.2)


# Shared work queue that the worker threads pull jobs from.
q = Queue.Queue()
# Number of worker threads to start.
NUM = 50
# Every job to process, built once at import time.
JOBS = marge_url()
# Timestamp (whole seconds) shared by all rows written during this run.
startTime = int(time.time())
# Worker thread objects, filled in by the main block.
threads = list()
# Lock shared by all worker threads.
mutex = threading.Lock()

class MyThread(threading.Thread):
    """Worker thread that processes jobs from a shared queue until it is empty.

    Each job is a sequence ``[url, table, type, itunesid]``: the URL is
    downloaded, parsed with ``get_xml_data`` and stored with ``addAppRank``.
    """

    def __init__(self, work_queue, startTime):
        """Remember the queue to drain and the timestamp of this run.

        Args:
            work_queue: Queue holding the jobs.
            startTime: Timestamp passed on to ``addAppRank`` for every job.
        """
        threading.Thread.__init__(self)
        self.work_queue = work_queue
        # Keep the value handed to the constructor instead of silently
        # falling back to the module-level global in run().
        self.startTime = startTime

    def run(self):
        """Process jobs until the queue reports that it is empty."""
        while True:
            # A non-blocking get avoids the race where another worker takes
            # the last job between a size check and a blocking get(), which
            # would leave this thread waiting forever.
            try:
                arguments = self.work_queue.get_nowait()
            except Queue.Empty:
                break

            try:
                # Downloading and parsing touch no shared state, so they run
                # outside the lock and can overlap between workers.
                app_string = get_appstop_data(arguments[0])
                app_arr = get_xml_data(app_string, arguments[1], arguments[2], arguments[3])
                # Database writes stay serialised. The with-block releases
                # the lock even when storing fails, so one bad job cannot
                # deadlock the remaining workers.
                with mutex:
                    addAppRank(app_arr, self.startTime)
            except Exception as e:
                # Report the failed job and carry on with the next one.
                print('job failed for %s: %s' % (arguments[0], e))

            time.sleep(1)


if __name__ == '__main__':
    # Load every job into the shared queue.
    for job in JOBS:
        q.put(job)

    # Create all workers before starting any of them.
    threads.extend(MyThread(q, startTime) for _ in range(NUM))

    # Start the workers as daemon threads.
    for worker in threads:
        worker.setDaemon(True)
        worker.start()

    # Wait until every worker has finished.
    for worker in threads:
        worker.join()

    print('主线程运行结束')
    

 cnblogs  不能上传文件, 需要源文件的可以 联系我!  QQ:516317457