# Python 爬虫:获取世界杯比赛赛程 (Python crawler that fetches the World Cup match schedule)
#!/usr/bin/python
# -*- coding:utf8 -*-
"""Collect the 2018 World Cup match schedule into CSV files.

Two data sources are supported:

* ``getHtml`` parses a locally saved copy of the schedule page.
* ``getFromAPI`` queries the tiyu.baidu.com match API day by day.

Every CSV row has the form ``team1,team2,time_expire,time_play`` where
``time_expire`` is ten minutes before kick-off.
"""
import json
import os
import re
import time
from datetime import date

# Timestamp layout shared by the scraped page, the API and the CSV output.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

# Offset, in seconds, applied to the kick-off time to get the expiry time.
EXPIRE_GAP_SECONDS = -10 * 60

# Default input/output locations.
HTML_PATH = 'F:\\test\\python\\worldcup.html'
HTML_CSV_PATH = "F:\\test\\python\\worldcup.csv"
API_CSV_PATH = "F:\\test\\python\\worldcupFromAPI.csv"

API_URL_PREFIX = ("http://tiyu.baidu.com/api/match/"
                  "%E4%B8%96%E7%95%8C%E6%9D%AF/live/date/2018-")
API_URL_SUFFIX = "/direction/after?from=self"


def getTimeExpire(time_play, time_gap):
    """Return ``time_play`` shifted by ``time_gap`` seconds.

    Args:
        time_play: Local time formatted as ``'YYYY-mm-dd HH:MM:SS'``.
        time_gap: Offset in seconds (not milliseconds); a negative value
            moves the time earlier.

    Returns:
        The shifted time in the same format, or ``''`` when ``time_play``
        cannot be parsed (a message is printed in that case).
    """
    try:
        parsed = time.strptime(time_play, TIME_FORMAT)
    except (TypeError, ValueError):
        # Best effort: callers write the empty string into the CSV row.
        print('时间转化失败')
        return ''
    shifted = time.localtime(time.mktime(parsed) + time_gap)
    return time.strftime(TIME_FORMAT, shifted)


def getHtml(html_path=HTML_PATH, csv_path=HTML_CSV_PATH):
    """Parse the saved schedule page and append its matches to a CSV file.

    Args:
        html_path: Location of the saved schedule page (read as UTF-8).
        csv_path: CSV file to append to. The header row is written only
            when the file does not exist yet or is empty, so repeated runs
            do not insert duplicate headers.
    """
    # Imported here so that bs4/lxml are only required for this data source.
    from bs4 import BeautifulSoup

    with open(html_path, 'r', encoding='utf-8') as page:
        soup = BeautifulSoup(page.read(), 'lxml')

    need_header = (not os.path.exists(csv_path)
                   or os.path.getsize(csv_path) == 0)

    # NOTE: the output uses the platform default encoding, as before.
    with open(csv_path, 'a') as out:
        if need_header:
            out.write('team1,team2,time_expire,time_play \n')
        for day_node in soup.select('.b-pull-refresh-content > div'):
            day_title = day_node.select(
                '.wa-match-schedule-list-title')[0].get_text().strip()
            matches = day_node.select(
                '.sfc-contacts-list .wa-match-schedule-list-item')
            for match in matches:
                names = match.select(
                    '.wa-tiyu-schedule-item-name.c-line-clamp1')
                team1 = names[0].get_text().strip()
                team2 = names[1].get_text().strip()
                kickoff = match.select('.status-text')[0].get_text().strip()
                # Characters 2..7 of the day title are used as the date part
                # (presumably "mm-dd " -- confirm against the page markup).
                time_play = '2018-' + day_title[2:8] + kickoff + ':00'
                time_expire = getTimeExpire(time_play, EXPIRE_GAP_SECONDS)
                out.write(','.join(
                    (team1, team2, time_expire, time_play)) + '\n')


def getFromAPI(csv_path=API_CSV_PATH):
    """Query the match API and append not-yet-started matches to a CSV file.

    Fifteen requests are made, one per second, for every second day starting
    at 2018-6-13; once the day number exceeds 30 the sequence continues from
    day 1 of the next month.

    Args:
        csv_path: CSV file to append to. It is only opened when a response
            contains at least one match that has not started yet.
    """
    # Imported here so that requests is only required for this data source.
    import requests

    month = 6
    day = 11
    for _ in range(15):
        day += 2
        if day > 30:
            month += 1
            day = 1
        url = API_URL_PREFIX + str(month) + '-' + str(day) + API_URL_SUFFIX
        time.sleep(1)  # stay polite: at most one request per second
        data = json.loads(requests.get(url, timeout=3).text)
        if data['status'] == '0':
            print('为0')

        # Timestamps are zero-padded, so plain string comparison orders them.
        now = time.strftime(TIME_FORMAT, time.localtime())
        rows = []
        # A response without a 'data' payload simply contributes no rows.
        for group in data.get('data') or []:
            for match in group['list']:
                start = match['startTime']
                if start > now:
                    rows.append(','.join((
                        match['leftLogo']['name'],
                        match['rightLogo']['name'],
                        getTimeExpire(start, EXPIRE_GAP_SECONDS),
                        start,
                    )) + '\n')
        if rows:
            with open(csv_path, 'a') as out:
                out.writelines(rows)


# getHtml()
if __name__ == "__main__":
    getFromAPI()