1kkk
Downloading some comics for a buddy to read.
Code:
#!/usr/bin/python3.4
# -*- coding:utf-8 -*-

import requests
import os
import time
import re
from lxml import etree
import random


def geturl(url, postdata):
    # headers that mimic the mobile reader requesting an image from the CDN
    header = {'User-Agent':
                  'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
              'Referer': 'http://m.1kkk.com/vol1-6871/',
              'Host': 'manhua1023.61-174-50-131.cdndm5.com',
              'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
              'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
              'Connection': 'keep-alive',
              }

    # post the faked form data first to pick up the session cookies,
    # then request the image with those cookies
    s = requests.Session()
    r = s.post('http://m.1kkk.com/userdata.ashx', data=postdata)
    _cookies = r.cookies
    # print(r.content)
    rs = s.get(url, headers=header, cookies=_cookies)
    return rs


def get(url):
    # fetch a page from the desktop site
    header = {'User-Agent':
                  'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
              'Referer': 'http://www.1kkk.com/manhua589/',
              'Host': 'www.1kkk.com'}

    html_bytes = requests.get(url, headers=header)
    return html_bytes


def mget(url):
    # fetch a page from the mobile site
    header = {'User-Agent':
                  'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
              'Referer': 'http://m.1kkk.com/manhua589/',
              'Host': 'm.1kkk.com'}

    html_bytes = requests.get(url, headers=header)
    return html_bytes


# strip characters that are illegal in Windows file names
def validateTitle(title):
    # '/\:*?"<>|'
    rstr = r"[\/\\\:\*\?\"\<\>\|]"
    new_title = re.sub(rstr, "", title)
    return new_title


def prints(timesleep):
    print('Pausing ' + str(timesleep) + ' seconds before batch-downloading the images, please keep the network up...')
    time.sleep(timesleep)


# pull the packed "var ... .split" statement out of the returned js
def regnext(js):
    reg = r'(var.+?.split)'
    pattern = re.compile(reg)
    alllist = re.findall(pattern, js)
    return alllist


# create the folder (and its parents) if it does not exist yet
def createjia(path):
    try:
        os.makedirs(path)
    except OSError:
        print('Directory already exists: ' + path)


if __name__ == '__main__':

    html = get('http://www.1kkk.com/manhua589/').content.decode('utf-8', 'ignore')

    page = etree.HTML(html.lower())
    # chapter URL suffixes
    hrefs = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/@href')
    # chapter titles
    hrefnames = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/text()')
    # page counts
    hrefpages = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/text()')

    href = []
    hrefname = []
    hrefpage = []
    number = 1

    # not sure which of those volumes really belong to this comic,
    # grab everything first and sort it out later
    # chapter URL suffixes
    for temp in hrefs:
        towurl = temp
        href.append(towurl)
    # chapter titles
    for temp in hrefnames:
        hrefname.append(temp)
    # page counts ("页" is the "pages" suffix the site appends)
    for temp in hrefpages:
        hrefpage.append(temp.replace("页", "").strip())

    filenamep = '../data/' + str(hrefname[0]) + "/"
    createjia(filenamep)

    for i in range(0, len(href)):
        # hrefpage[i] is the page count of chapter i
        for j in range(int(hrefpage[i])):

            # chapter ids such as 6871, 6872, ...
            hrefnumber = str(href[i]).replace("ch54-", "").replace("/", "").replace("vol1-", "")
            # print(hrefnumber)
            # build the js url, e.g.
            # http://www.1kkk.com/vol1-6871/imagefun.ashx?cid=6871&page=1&key=65abd421f4aed565&maxcount=10
            jsurl = "http://www.1kkk.com" + str(href[i]) + "/imagefun.ashx?cid=" + str(hrefnumber) \
                    + "&page=" + str(j + 1) + "&key=65abd421f4aed565&maxcount=10"
            print(jsurl)

            # fetch the js and pull out the pieces of the image url
            html = get(jsurl).content.decode('utf-8', 'ignore')
            html1 = regnext(html)
            html1 = html1[0].replace("'.split", "").split('|')

            # build the image url, e.g.
            # http://manhua1023.61-174-50-131.cdndm5.com/1/589/6871/102_9224.jpg?cid=6871&key=d8ce90e0b3f013f292ef77e84da88990&type=1
            image_1url = "http://manhua1023." + str(html1[19]) + "-" + str(html1[18]) + "-" + str(html1[9]) + "-" \
                         + str(html1[10]) + ".cdndm5.com/1/589/" + str(hrefnumber) + "/" + str(html1[20]) \
                         + "?cid=" + str(hrefnumber) + "&key=" + str(html1[8]) + "&type=1"
            print(image_1url)

            # fake the postdata; cid/mid are hard-coded for this particular comic
            postdata = {
                'cid': 6871,
                'language': 1,
                'mid': 589,
                'page': j + 1,
                'tp': 8,
                'uid': 0
            }

            # even a correctly built url still cannot be downloaded
            pic = geturl(image_1url, postdata)
            with open(filenamep + str(j + 1) + '.jpg', 'wb') as filess:
                filess.write(pic.content)
            print('Wrote image ' + str(j + 1))

            # pause 1-3 seconds after every download
            loadimg = random.randint(1, 3)
            print('Pausing ' + str(loadimg) + ' seconds')
            time.sleep(loadimg)
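The fiddly part above is turning the obfuscated JavaScript returned by imagefun.ashx into a real image URL: regnext() grabs the packed "var ... .split" statement, the string is split on '|', and a handful of fixed positions in the token list (8, 9, 10, 18, 19, 20) hold the key, the CDN host fragments and the image file name. Below is a minimal sketch of just that parsing step; the js string is a made-up stand-in for the real response (only the token positions the downloader reads are meant to be realistic), with values taken from the example URL in the comment above.

import re

# hypothetical stand-in for the imagefun.ashx response body; only the
# tokens at positions 8, 9, 10, 18, 19 and 20 are meant to be realistic
js = ("var data='t0|t1|t2|t3|t4|t5|t6|t7"
      "|d8ce90e0b3f013f292ef77e84da88990|50|131"
      "|t11|t12|t13|t14|t15|t16|t17|174|61|102_9224.jpg'.split")

tokens = re.findall(r'(var.+?.split)', js)[0].replace("'.split", "").split('|')

key = tokens[8]     # value used in the ?key= query parameter
host = "manhua1023." + tokens[19] + "-" + tokens[18] + "-" + tokens[9] + "-" + tokens[10] + ".cdndm5.com"
name = tokens[20]   # file name of the page image

print(host, name, key)
# -> manhua1023.61-174-50-131.cdndm5.com 102_9224.jpg d8ce90e0b3f013f292ef77e84da88990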
Scraping with Selenium:
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
import re


# strip characters that are illegal in Windows file names
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/\:*?"<>|'
    new_title = re.sub(rstr, "", title)
    return new_title


def getimg():
    # http://www.cnblogs.com/fnng/p/3238685.html
    # open Firefox (Chrome works too)
    # browser = webdriver.Chrome()
    browser = webdriver.Firefox()

    # set the browser window size
    browser.set_window_size(1200, 900)
    # open the page
    browser.get("http://m.1kkk.com/vol1-6871/")
    # adjust the wait to however long the page takes to load on your connection
    time.sleep(10)

    for i in range(10000):

        # close the ad
        browser.find_element_by_class_name("ad_cross").click()

        # scroll to the bottom of the page step by step, then flag the
        # title so the Python side knows scrolling has finished
        browser.execute_script("""
            (function () {
                var y = 0;
                var step = 100;
                window.scroll(0, 0);

                function f() {
                    if (y < document.body.scrollHeight) {
                        y += step;
                        window.scroll(0, y);
                        setTimeout(f, 100);
                    } else {
                        window.scroll(0, 0);
                        document.title += "scroll-done";
                    }
                }

                setTimeout(f, 1000);
            })();
        """)
        print("Scrolling down...")
        # time.sleep(180)
        while True:
            if "scroll-done" in browser.title:
                break
            else:
                print("Not at the bottom yet...")
                time.sleep(10)

        # while True:
        #     # check whether this element exists yet
        #     select = browser.find_element_by_xpath('//a[@class="readTipForm"]')
        #     if select:
        #         break
        #     else:
        #         print("Not at the bottom yet...")
        #         time.sleep(60)

        print("Saving the image...")
        # name the screenshot after the current url
        name = validateTitle(browser.current_url)
        print("Taking a screenshot...")
        time.sleep(5)

        # screenshot the whole window
        browser.save_screenshot("../jpg/cartoon/" + str(i + 1) + str(name) + ".png")
        time.sleep(5)

        # click "read the next chapter"
        browser.find_element_by_class_name("readTipForm").click()
        print("Moving on to the next chapter...")
        time.sleep(5)

    browser.quit()


if __name__ == '__main__':
    getimg()
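One caveat if you rerun this today: the find_element_by_class_name / find_element_by_xpath helpers were deprecated and then removed in Selenium 4, so on a current install the locator calls have to go through By instead. A minimal sketch of the equivalent calls, assuming the ad_cross and readTipForm class names from the script above are still present on the page:

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
browser.get("http://m.1kkk.com/vol1-6871/")

# Selenium 4 locator syntax; the class names are the ones used above
# and are assumed to still exist on the page
browser.find_element(By.CLASS_NAME, "ad_cross").click()
browser.find_element(By.CLASS_NAME, "readTipForm").click()

browser.quit()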