Crawling files from GitHub
For my first crawler I scraped a simple Baidu search results page, and I realized I still have a long way to go before becoming a crawler expert. So this time I took on a harder challenge: scraping information from GitHub.
The process: search for a keyword on GitHub and wait for the results page to load; extract the URL of each result from that page, filtering for the file type you want with a string check or a regular expression, and save the URLs to a list; then visit each URL in turn, fetch its page, extract the content you want with XPath or a regular expression, and finally save it to a local folder. Corrections and pointers from the experts are welcome!
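A note on the filtering step: the full script below uses a plain endswith() check, but the regular-expression variant mentioned above would look roughly like this (a minimal sketch; the sample URLs are made up for illustration):

import re

# Hypothetical result links; only the .xml file should survive the filter.
candidates = [
    "https://github.com/user/repo/blob/master/config.xml",
    "https://github.com/user/repo/blob/master/readme.md",
]
# Keep only links whose path ends in ".xml" (case-insensitive).
xml_pattern = re.compile(r"\.xml$", re.IGNORECASE)
xml_urls = [u for u in candidates if xml_pattern.search(u)]
print(xml_urls)  # ['https://github.com/user/repo/blob/master/config.xml']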
# -*- coding: utf-8 -*-
import os
import time
import urllib.parse

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By


# Build the search URL for one results page.
def page(number):
    data = {}
    # Language filter
    data['l'] = 'XML'
    # Page number
    data['p'] = number
    # Search keyword
    data['q'] = 'xml password'
    data['type'] = 'Code'
    url_values = urllib.parse.urlencode(data)
    full_url = 'https://github.com/search?' + url_values
    print(full_url)
    return full_url


# Fetch a URL and parse it into an XPath-queryable tree.
def pro(item):
    driver.get(item)
    # page_source is already a str in Python 3, so no decoding is needed;
    # etree.HTML builds a parse tree and repairs broken HTML automatically.
    return etree.HTML(driver.page_source)


# Extract the result links from one search-results page.
def makeurl(content):
    # Locate the container element of the result list
    codelist = content.xpath("/html/body/div[4]/main/div/div[3]/div/div[2]/div[1]/*")
    # The container holds 10 result links per page
    for node in codelist:
        href = node.xpath("./div[1]/div[2]/a/@href")
        if href:
            check("https://github.com" + href[0])


# Fetch one file page and pull out the XML text.
def getxmlcontent(count):
    driver.get(urls[count])
    time.sleep(2)
    print(urls[count])
    content = etree.HTML(driver.page_source)
    # Locate the table body that holds the file's lines
    xmlcontent = content.xpath("/html/body/div[4]/div/main/div[3]/div/div[3]/div[2]/table/tbody/*")
    # Extract the file line by line (the code text sits in the second cell)
    lines = []
    for row in xmlcontent:
        lines.append(''.join(row.xpath(".//td[2]//text()")) + "\r\n")
    # Join everything into one string
    return ''.join(lines)


# Write the scraped data to a file; the counter keeps filenames unique
# even when several files are saved within the same second.
def save(count, xmlcontents):
    filename = time.strftime('%Y%m%d-%H%M%S', time.localtime()) + '-%d' % count
    path = os.path.join(r"D:\Myfiles\xmlwhite\gitxml-14", filename + ".xml")
    with open(path, 'w', encoding='utf-8') as f:
        f.write(xmlcontents)


# Keep only links with an xml suffix.
def check(isxml):
    if isxml.endswith("xml"):
        urls.append(isxml)


if __name__ == "__main__":
    baseurl = "https://github.com/login"
    driver = webdriver.Chrome()
    driver.get(baseurl)
    driver.find_element(By.ID, "login_field").send_keys("username")
    driver.find_element(By.ID, "password").send_keys("password")
    driver.find_element(By.NAME, "commit").click()
    time.sleep(20)
    i = 0  # files visited
    s = 0  # files saved
    k = 0  # empty files
    # GitHub shows at most 100 pages of search results, so one keyword can
    # never yield an unlimited number of files; vary the keyword instead.
    for pitem in range(100):
        urls = []
        makeurl(pro(page(pitem + 1)))
        print(urls)
        print("----------------- page %d" % (pitem + 1))
        for item in range(len(urls)):
            i = i + 1
            try:
                xmlcontents = getxmlcontent(item)
                if xmlcontents:
                    s = s + 1
                    save(s, xmlcontents)
                else:
                    k = k + 1
                    print("----------------- current file is empty!")
            except Exception as e:
                print("----------------- write failed:", e)
            else:
                print("----------------- continuing...")
            print("page %d: fetching file %d, %d files saved so far, %d empty files, moving on..."
                  % (pitem + 1, i, s, k))
    print("----------------- this crawl is finished! --------------")
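A closing note: the Selenium route above scrapes GitHub's rendered HTML, which breaks whenever the page layout changes. A sturdier alternative for the search step is GitHub's code-search REST API, which returns JSON. Here is a minimal sketch with requests, assuming a personal access token in the GITHUB_TOKEN environment variable (code search requires authentication and is rate-limited):

import os
import requests

# Minimal sketch: the same search against GitHub's code-search REST API.
def search_xml(keyword, page_number):
    resp = requests.get(
        "https://api.github.com/search/code",
        params={"q": keyword + " language:XML", "page": page_number},
        headers={
            # Assumes a personal access token in GITHUB_TOKEN.
            "Authorization": "token " + os.environ["GITHUB_TOKEN"],
            "Accept": "application/vnd.github+json",
        },
    )
    resp.raise_for_status()
    # Each result item carries the file's web URL.
    return [item["html_url"] for item in resp.json()["items"]]

print(search_xml("xml password", 1))

From the returned links the raw file content can then be fetched directly, which avoids running XPath over the HTML table for every file.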