噢百万结果抓取

 

 

import requests
import re
from lxml import etree

# Scrape the obaiwan.com "hk49" results pages, one POST per year, and save
# every matched table row to history.txt as one comma-separated line.

url = 'http://www.obaiwan.com/hk49/results/'

# Matches one <tr> of the results table and captures nine fields per row:
# the 2nd and 3rd cells, the six following <b> values, and -- after seven
# skipped cells -- one final <b> value.
# NOTE(review): presumably draw number, date, six drawn numbers and the
# special number; confirm against the live page markup.
# The pattern is whitespace-sensitive (CRLF line ends, the exact spaces
# inside the tags), so it is kept byte-for-byte.
p = re.compile('''<tr >\r\n<td  >.+?</td>\r\n<td  >(.+?)</td>\r\n<td >(.+?)</td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n<td  >.+?</td>\r\n<td >.+?</td>\r\n<td >.+?</td>\r\n<td >.+?</td>\r\n<td >.+?</td>\r\n<td >.+?</td>\r\n<td >.+?</td>\r\n<td ><b style=".+?">(.+?)</b></td>\r\n</tr>''')

# Collect all rows first and only touch history.txt once every request has
# succeeded, so a network failure cannot leave a truncated/empty file behind.
lines = []

for year in range(2003, 2016):
    # NOTE(review): the 'submit' value is already percent-encoded; requests
    # form-encodes dict values again, so it probably goes out as
    # '%25CC%25E1...'. Left unchanged because the server's expectation cannot
    # be verified from here -- confirm whether the field matters at all.
    data = {'qinum': year, 'submit': '%CC%E1%BD%BB%B2%E9%D1%AF'}
    # Without a timeout, requests waits forever on an unresponsive server.
    r = requests.post(url, data=data, timeout=30)
    # Decode the response body as gb2312 before matching.
    r.encoding = 'gb2312'
    lines.extend(','.join(row) + '\n' for row in p.findall(r.text))

res = ''.join(lines)

# Explicit encoding keeps the output independent of the platform locale and
# avoids UnicodeEncodeError if a captured field contains non-ASCII text.
with open('history.txt', 'w', encoding='utf-8') as f:
    f.write(res)

 

posted @ 2015-04-08 15:03  罗兵  阅读(840)  评论(0)  编辑  收藏  举报