python之 MySQLdb 实践 爬一爬号码
0.目录
2.构建URL
3.新建数据库
4.新建汇总表
5.定义连接数据库函数:connect_db(db=None, cursorclass=DictCursor)
6.汇总表填充必要数据
7.新建各省份子表
8.完整代码
1.参考
2.构建URL
3.新建数据库
mysql> CREATE DATABASE mobile -> CHARACTER SET 'utf8' -> COLLATE 'utf8_general_ci'; Query OK, 1 row affected (0.00 sec) mysql> use mobile; Database changed
4.新建汇总表
int(M) M是显示宽度,无需设置。
int 带符号表示范围 [-2147483648,2147483647] ,手机号码11位数字已超。
mysql> use mobile Database changed mysql> CREATE TABLE china( -> id INT NOT NULL auto_increment, -> province VARCHAR(100) NOT NULL, -> city VARCHAR(100) NOT NULL, -> num_count INT NULL, -> new_time DATETIME NULL, -> update_time DATETIME NULL, -> latest_num BIGINT NULL, -> province_zh VARCHAR(100) NOT NULL, -> city_zh VARCHAR(100) NOT NULL, -> url VARCHAR(255) NOT NULL, -> -> PRIMARY KEY(id) -> )engine=InnoDB DEFAULT CHARSET=utf8; Query OK, 0 rows affected (0.01 sec)
5.定义连接数据库函数:connect_db(db=None, cursorclass=DictCursor)
import MySQLdb from MySQLdb.cursors import Cursor, DictCursor def connect_db(db=None, cursorclass=DictCursor): conn= MySQLdb.connect( host='127.0.0.1', #'localhost' port = 3306, user='root', passwd='root', # db ='mobile', charset='utf8' ) curs = conn.cursor(cursorclass) if db is not None: curs.execute("USE %s"%db) return conn, curs
conn, curs = connect_db('mobile')
6.汇总表填充必要数据
info数据构成:
In [105]: info = pickle.load(open('10010')) In [106]: info[0] Out[106]: ('18xxxxx3664', #此处隐藏号码细节 u'\u6cb3\u5317', u'\u79e6\u7687\u5c9b\u5e02', u'hebei', u'qinhuangdaoshi', 'https://m.1xxxx.com/xxxxxxxxxx') #此处隐藏url细节 In [107]: info = [dict(province_zh=i[1], city_zh=i[2], province=i[3], city=i[4], url=i[-1]) for i in info] In [108]: info[0] Out[108]: {'city': u'qinhuangdaoshi', 'city_zh': u'\u79e6\u7687\u5c9b\u5e02', 'province': u'hebei', 'province_zh': u'\u6cb3\u5317', 'url': 'https://m.1xxxx.com/xxxxxxxxxx'} #此处隐藏url细节
# python中的排序问题——多属性排序
# https://www.2cto.com/kf/201312/265675.html
In [114]: info = sorted(info, key=lambda x:(x.get('province'), x.get('city')))
插入必要数据:
In [115]: curs.executemany(""" ...: INSERT INTO china(province, city, province_zh, city_zh, url) ...: values(%(province)s,%(city)s,%(province_zh)s,%(city_zh)s,%(url)s)""", info) ...: conn.commit()
山西和陕西的拼音相同
mysql> select count(distinct province),count(distinct city),count(distinct province_zh),count(distinct city_zh),count(distinct url) from china; +--------------------------+----------------------+-----------------------------+-------------------------+---------------------+ | count(distinct province) | count(distinct city) | count(distinct province_zh) | count(distinct city_zh) | count(distinct url) | +--------------------------+----------------------+-----------------------------+-------------------------+---------------------+ | 30 | 326 | 31 | 331 | 339 | +--------------------------+----------------------+-----------------------------+-------------------------+---------------------+ 1 row in set (0.00 sec)
修正 陕西 为 shanxi3
mysql> set character_set_client = gbk;
Query OK, 0 rows affected (0.00 sec)
mysql> set character_set_results = gbk;
Query OK, 0 rows affected (0.00 sec)
mysql> UPDATE china -> SET province = 'shanxi3' -> WHERE province_zh = '陕西'; Query OK, 10 rows affected (0.01 sec) Rows matched: 10 Changed: 10 Warnings: 0
如果使用curs 进行 update 操作需要 conn.commit()
In [99]: curs.execute(""" ...: UPDATE china ...: SET province = 'shanxi3' ...: WHERE province_zh = '陕西'; ...: """) "\nUPDATE china\nSET province = 'shanxi3'\nWHERE province_zh = '\xe9\x99\x95\xe8\xa5\xbf';\n" Out[99]: 10L In [100]: conn.commit() In [101]: curs.fetchall() Out[101]: ()
7.新建各省份子表
curs.execute(""" SELECT DISTINCT province FROM china """) table_names = [i.get('province') for i in curs] # id 字段在这里并不是必要 # UNIQUE 用于后续插入行时对已有号码只更新原记录的部分属性 on duplicate # FOREIGN KEY 用于同步跟随汇总表 for table_name in table_names: curs.execute(""" CREATE TABLE %s( id INT NOT NULL AUTO_INCREMENT, china_id INT NOT NULL, num BIGINT NOT NULL, times INT NULL DEFAULT 1, update_time datetime NULL, head INT(3) NULL, mid INT(4) ZEROFILL NULL, tail INT(4) ZEROFILL NULL, mid_match CHAR(4) NULL, tail_match CHAR(4) NULL, PRIMARY KEY(id), UNIQUE KEY(num), FOREIGN KEY (china_id) REFERENCES china(id) ON DELETE CASCADE ON UPDATE CASCADE )ENGINE=InnoDB DEFAULT CHARSET=utf8;"""%(table_name))
参看索引 index
mysql> show index from anhui; +-------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+ | Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | +-------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+ | anhui | 0 | PRIMARY | 1 | id | A | 0 | NULL | NULL | | BTREE | | | | anhui | 0 | num | 1 | num | A | 0 | NULL | NULL | | BTREE | | | | anhui | 1 | china_id | 1 | china_id | A | 0 | NULL | NULL | | BTREE | | | +-------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+
8.完整代码
#!/usr/bin/env python # -*- coding: UTF-8 -* import time import re import MySQLdb from MySQLdb.cursors import Cursor, DictCursor import json import traceback import threading lock = threading.Lock() import Queue task_queue = Queue.Queue() result_queue = Queue.Queue() import requests from requests.exceptions import (ConnectionError, ConnectTimeout, ReadTimeout, SSLError, ProxyError, RetryError, InvalidSchema) s = requests.Session() s.headers.update({'user-agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_3_5 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Mobile/13G36 MicroMessenger/6.5.12 NetType/4G'}) # 此处隐藏 Referer 细节,也可不用 # s.headers.update({'Referer':'https://servicewechat.com/xxxxxxxxxxx'}) s.verify = False s.mount('https://', requests.adapters.HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) import copy sp = copy.deepcopy(s) proxies = {'http': 'http://127.0.0.1:3128', 'https': 'https://127.0.0.1:3128'} sp.proxies = proxies from urllib3.exceptions import InsecureRequestWarning from warnings import filterwarnings filterwarnings('ignore', category = InsecureRequestWarning) import logging def get_logger(): logger = logging.getLogger("threading_example") logger.setLevel(logging.DEBUG) # fh = logging.FileHandler("d:/threading.log") fh = logging.StreamHandler() fmt = '%(asctime)s - %(threadName)-10s - %(levelname)s - %(message)s' formatter = logging.Formatter(fmt) fh.setFormatter(formatter) logger.addHandler(fh) return logger logger = get_logger() def connect_db(db=None, cursorclass=DictCursor): conn= MySQLdb.connect( host='127.0.0.1', #'localhost' port = 3306, user='root', passwd='root', # db ='mobile', charset='utf8' ) curs = conn.cursor(cursorclass) if db is not None: curs.execute("USE %s"%db) return conn, curs def read_table_china(): dict_conn, dict_curs = connect_db('mobile',DictCursor) dict_curs.execute(""" SELECT id, province, province_zh, city_zh, url FROM china """) table_china = dict_curs.fetchall() dict_conn.close() return table_china def add_task(): while True: if task_queue.qsize() < 1000: for t in table_china: task_queue.put(t) #队列里面依旧只有300多个id,重复引用,后续get还是得dict loop = 0 def do_task(): global loop while True: task = dict(task_queue.get()) ########### nums = get_nums(task.get('url')) if nums is None: continue task.update(dict(update_time=time.strftime('%y-%m-%d %H:%M:%S'))) # results用于后续更新子表,executemany insert 效率更高 results = [] for num in nums: result = dict(task) ########### match_dict = parse_num(num) result.update(match_dict) results.append(result) result_queue.put(results) task_queue.task_done() # 记得加u logger.debug(u'{} tasks:{} results:{} threads: {} loop: {} {}_{}'.format(nums[-1], task_queue.qsize(), result_queue.qsize(), threading.activeCount(), loop, task.get('province_zh'), task.get('city_zh'))) with lock: loop += 1 def get_nums(url): try: url = url + str(int(time.time()*1000)) resp = sp.get(url)#, timeout=10) #加上超时,网络性能矩形陡降 rst = resp.content # rst = rst[rst.index('{'):rst.index('}')+1] m = re.search(r'({.*?})', rst) match = m.group() rst = json.loads(match) nums = [num for num in rst['numArray'] if num>10000] # nums_len = len(nums) # assert nums_len == 10 assert nums != [] return nums except (ConnectionError, ConnectTimeout, ReadTimeout, SSLError, ProxyError, RetryError, InvalidSchema) as err: pass except (ValueError, AttributeError, IndexError) as err: pass except AssertionError as err: pass except Exception as err: print err,traceback.format_exc() # 解析号码特征 def parse_num(num): # num = 18522223333 num_str = str(num) head = num_str[:3] mid = num_str[3:7] tail = num_str[-4:] match_dict = {'mid_match':mid, 'tail_match':tail} for k,v in match_dict.items(): part_1, part_2, part_3, part_4 = [int(i) for i in v] if part_1-part_2==part_2-part_3==part_3-part_4== -1: match_dict[k] = 'ABCD' elif part_1-part_2==part_2-part_3==part_3-part_4== 1: match_dict[k] = 'DCBA' elif part_1==part_2==part_3==part_4: match_dict[k] = 'SSSS' elif part_2==part_3 and (part_1==part_2 or part_3==part_4): match_dict[k] = '3S' elif part_1==part_2 and part_3==part_4: match_dict[k] = 'XXYY' elif part_1==part_3 and part_2==part_4: match_dict[k] = 'XYXY' elif part_1==part_3 and k == 'mid_match': match_dict[k] = 'XYXZ' else: match_dict[k] = None match_dict.update(dict(num=num, head=int(head), mid=int(mid), tail=int(tail))) return match_dict def update_table_province(): conn, curs = connect_db('mobile', Cursor) while True: try: results = result_queue.get() prefix = "insert into %s"%(results[0].get('province')) # 已经设置 num 字段为unique,如果可能导致重复,则更新 update_time , 同时 times 加1 sql = (prefix+\ """(china_id, num, update_time, head, mid, tail, mid_match, tail_match) values(%(id)s, %(num)s, %(update_time)s, %(head)s, %(mid)s, %(tail)s, %(mid_match)s, %(tail_match)s) ON DUPLICATE KEY UPDATE update_time=values(update_time), times=times+1""") curs.executemany(sql, results) conn.commit() result_queue.task_done() except Exception as err: # pass print err,traceback.format_exc() result_queue.put(results) try: conn.close() except: pass conn, curs = connect_db('mobile', Cursor) def update_table_china(): dict_conn, dict_curs = connect_db('mobile', DictCursor) province_list = set([info.get('province') for info in table_china]) while True: try: for province in province_list: # 先按照id降序,最后获取的新号码靠前 dict_curs.execute(""" SELECT china_id, count(*) AS num_count, update_time AS new_time FROM (select * from %s order by id desc) AS temp GROUP BY china_id; """%(province)) # dict_curs的复用? dict_curs.executemany(""" UPDATE china SET num_count = %(num_count)s, new_time = %(new_time)s WHERE id = %(china_id)s; """, dict_curs) dict_conn.commit() # 先按照更新时间降序 dict_curs.execute(""" SELECT china_id, update_time, num AS latest_num FROM (select * from %s order by update_time desc) AS temp GROUP BY china_id; """%(province)) dict_curs.executemany(""" UPDATE china SET update_time = %(update_time)s, latest_num = %(latest_num)s WHERE id = %(china_id)s; """, dict_curs) dict_conn.commit() time.sleep(300) except Exception as err: # pass print err,traceback.format_exc() try: dict_conn.close() except: pass dict_conn, dict_curs = connect_db('mobile', DictCursor) if __name__ == '__main__': table_china = read_table_china() threads = [] t = threading.Thread(target=add_task) #args接收元组,至少(a,) threads.append(t) for i in range(500): t = threading.Thread(target=do_task) threads.append(t) for i in range(20): t = threading.Thread(target=update_table_province) threads.append(t) t = threading.Thread(target=update_table_china) threads.append(t) # for t in threads: # t.setDaemon(True) # t.start() # while True: # pass for t in threads: t.start() # for t in threads: # t.join() while True: pass
9.运行结果