Scraping product listings from 1688 (Taobao's wholesale site), suitable when the crawling frequency is low
1. Prerequisites
Based on Python 3.6. Dependencies: selenium, xlwt, pandas, plus beautifulsoup4 (bs4) and XlsxWriter, which the code below also uses. They can be installed with pip, e.g. pip install selenium xlwt pandas beautifulsoup4 XlsxWriter.
You need to download the chromedriver that matches your version of Chrome.
To check your Chrome version:
click Help > About Google Chrome.
Then download the matching chromedriver from: https://chromedriver.chromium.org/downloads
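Before running the scraper, it is worth confirming that the driver you downloaded actually matches the installed browser. A minimal check, assuming the same Selenium 3 style API that the script below uses (the path is a placeholder):

from selenium import webdriver

# Placeholder path; point this at the chromedriver you just downloaded.
CHROME_DRIVER_PATH = '/path/to/chromedriver'

# If the versions do not match, chromedriver refuses to start the session and
# raises SessionNotCreatedException with an explicit version message.
driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH)
print(driver.capabilities.get('browserVersion'))                          # Chrome version
print(driver.capabilities.get('chrome', {}).get('chromedriverVersion'))   # chromedriver version
driver.quit()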
2. Code
# Set the base output directory
# 1. Initialize the folders
# 2. Set the links to crawl
# 3. Crawl each link
# 4. Parse the 1688 listing page
# 5. Persist the results to a spreadsheet
# 6. Mark links that have already been crawled
# 7. Merge everything into a single spreadsheet
The full code is below. When you use it, remember to move all the function definitions above the main block; I only placed the main block first to make the flow easier to follow.
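As a quick illustration of why that ordering matters (a hypothetical snippet, not part of the scraper): the if __name__ == '__main__': block runs at module level, so every function it calls must already be defined above it, otherwise Python raises a NameError.

def createDir(base_dir, site_name):
    # The definition has to appear above the call site below.
    pass

if __name__ == '__main__':
    # Works because createDir is already defined at this point; if the definition
    # were moved below this block, the call would raise NameError.
    createDir('/tmp/1688_demo', 's.1688.com')   # placeholder arguments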
import os, xlwt, time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from datetime import datetime


# Path to chromedriver, remember to replace it with your own
CHROME_DRIVER_PATH = '/Users/huangmengfeng/PycharmProjects/2020_study_plan/chromedriver'


if __name__ == '__main__':
    # Set the base directory, remember to replace it with your own. Only the base
    # directory needs to be set; the sub-directories are created automatically and
    # the merged output file ends up in OUT.
    base_dir_0 = '/Users/huangmengfeng/Desktop/1688_SCRIPY'
    site_name_0 = 's.1688.com'

    # 1. Initialize the folders
    tmp_data_dir_0, out_dir_0, record_path_0 = createDir(base_dir_0, site_name_0)
    done_url = getDoneUrl(record_path_0)
    # 2. Set the links to crawl (the keywords parameter is the GBK URL-encoded search term "欧美女装")
    need_url_list = [{'page_url': 'https://s.1688.com/selloffer/offer_search.htm?keywords=%C5%B7%C3%C0%C5%AE%D7%B0&n=y&netType=1%2C11%2C16',
                      'cate_name': '欧美女装', 'total_num': 50}]
    # 3. Crawl each link
    for need_url in need_url_list:
        for i in range(1, 2):
            now_page_url = "{0}&beginPage={1}#sm-filtbar".format(need_url['page_url'], i)
            # Skip links that have already been crawled
            if now_page_url not in done_url:
                # 4. Parse the 1688 listing page; simulate scrolling to load every item, since the page lazy-loads
                soup_0 = get_dynamic_html2(now_page_url)
                now_page_info_list = analysis1688Page(soup_0)
                # 5. Persist to a spreadsheet
                heads_0 = ['shop_name', 'shop_url', 'desc', 'url', 'price']
                exportToExcel(heads_0, now_page_info_list, tmp_data_dir_0, '{0}_{1}.xlsx'.format(need_url['cate_name'], i))
                # 6. Mark the link as crawled
                addDoneUrl(record_path_0, now_page_url)
    # 7. Merge everything into a single spreadsheet
    connectToOne(tmp_data_dir_0, out_dir_0, "{0}_{1}.xlsx".format(site_name_0, datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')))


# Initialize the folders
def createDir(base_dir, site_name):
    tmp_data_dir_1 = "{0}/{1}".format(base_dir, site_name)
    tmp_data_dir_2 = "{0}/{1}/{2}".format(base_dir, site_name, datetime.strftime(datetime.now(), '%Y%m%d'))
    out_dir = '{0}/OUT'.format(base_dir)
    path_arr = [base_dir, tmp_data_dir_1, tmp_data_dir_2, out_dir]
    for path in path_arr:
        if not os.path.exists(path):
            os.mkdir(path)
    record_path = '{0}/record.txt'.format(tmp_data_dir_2)
    if not os.path.exists(record_path):
        # Create an empty file that records the crawled urls
        with open(record_path, 'a+', encoding="utf-8") as f:
            f.write('')
    return tmp_data_dir_2, out_dir, record_path


# Load a dynamic page and return its parsed HTML
def get_dynamic_html2(site_url):
    print('Loading dynamic page', site_url)
    chrome_options = webdriver.ChromeOptions()
    # ban sandbox
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # use headless
    # chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--ignore-ssl-errors')
    driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, chrome_options=chrome_options)
    # print('dynamic load web is', site_url)
    driver.set_page_load_timeout(100)
    driver.set_window_size(1920, 1080)
    # driver.set_script_timeout(100)
    try:
        driver.get(site_url)
    except Exception as e:
        driver.execute_script('window.stop()')  # stop loading once the timeout is exceeded
        print(e, 'dynamic web load timeout')

    time.sleep(2)

    # Scroll through the page so the lazy-loaded items get rendered
    fullpage_screenshot(driver, 8000)

    data2 = driver.page_source
    soup2 = BeautifulSoup(data2, 'html.parser')

    try:
        time.sleep(3)
        driver.quit()
    except:
        pass
    return soup2


# Simulate scrolling (full-page screenshot workaround)
def fullpage_screenshot(driver, total_height):
    total_width = driver.execute_script("return document.body.offsetWidth")
    # total_height = driver.execute_script("return document.body.parentNode.scrollHeight")
    # total_height = 50000
    viewport_width = driver.execute_script("return document.body.clientWidth")
    viewport_height = driver.execute_script("return window.innerHeight")
    rectangles = []

    i = 0
    while i < total_height:
        ii = 0
        top_height = i + viewport_height

        if top_height > total_height:
            top_height = total_height

        while ii < total_width:
            top_width = ii + viewport_width

            if top_width > total_width:
                top_width = total_width
            rectangles.append((ii, i, top_width, top_height))

            ii = ii + viewport_width

        i = i + viewport_height

    previous = None
    part = 0

    for rectangle in rectangles:
        if previous is not None:
            driver.execute_script("window.scrollTo({0}, {1})".format(rectangle[0], rectangle[1]))
            print("Scrolled To ({0},{1})".format(rectangle[0], rectangle[1]))
            time.sleep(0.5)

        file_name = "part_{0}.png".format(part)
        print("Capturing {0} ...".format(file_name))

        # driver.get_screenshot_as_file(file_name)

        if rectangle[1] + viewport_height > total_height:
            offset = (rectangle[0], total_height - viewport_height)
        else:
            offset = (rectangle[0], rectangle[1])

        print("Adding to stitched image with offset ({0}, {1})".format(offset[0], offset[1]))
        part = part + 1
        previous = rectangle
    print("Finishing chrome full page screenshot workaround...")
    return True


# Parse the 1688 listing page
def analysis1688Page(soup):
    info_tag_list = soup.select('.sm-offer-item')
    now_page_info_list = []
    for info_tag in info_tag_list:
        skc = info_tag.attrs['trace-obj_value']
        a_tag_list = info_tag.select('a')
        price_tag_list = info_tag.select('span.sm-offer-priceNum')
        # print(info_tag)
        img_tag = a_tag_list[0].select('img')[0]

        shop_tag = a_tag_list[2]
        desc = img_tag.attrs['alt']
        url = 'https://detail.1688.com/offer/{0}.html'.format(skc)
        shop_name = shop_tag.text
        shop_url = shop_tag.attrs['href']

        price = '无'  # default when no price is shown
        if len(price_tag_list) > 0:
            price_tag = price_tag_list[0]
            price = price_tag.attrs['title']
        print({'shop_name': shop_name, 'shop_url': shop_url, 'desc': desc, 'url': url, 'price': price})
        now_page_info_list.append(
            {'shop_name': shop_name, 'shop_url': shop_url, 'desc': desc, 'url': url, 'price': price})
    return now_page_info_list


# Persist the data to a spreadsheet
# Note: xlwt writes the legacy .xls format even though the file name ends in .xlsx;
# reading these files back in connectToOne relies on an older pandas/xlrd that
# detects the format from the file content rather than the extension.
def exportToExcel(heads, task_done, path, filename):
    if not os.path.exists(path):
        os.makedirs(path)
    task_xls = xlwt.Workbook(encoding='utf-8')
    task_sheet1 = task_xls.add_sheet('sheet1')
    # Header row
    header_align = xlwt.Alignment()
    header_align.horz = xlwt.Alignment.HORZ_CENTER
    header_style = xlwt.XFStyle()
    header_style.alignment = header_align
    for i in range(len(heads)):
        task_sheet1.col(i).width = 12000
        task_sheet1.write(0, i, heads[i], header_style)
    # Data rows
    for i in range(len(task_done)):
        for j in range(len(heads)):
            task_sheet1.write(i + 1, j, task_done[i][heads[j]])
    print(os.path.join(path, filename))
    task_xls.save(os.path.join(path, filename))
    return filename


# Get the links that have already been crawled
def getDoneUrl(path):
    done_url = []
    with open(path, 'r', encoding="utf-8") as f:
        url_list = f.readlines()
        for url in url_list:
            done_url.append(url.rstrip('\n'))
    print(done_url)
    return done_url


# Record a crawled url
def addDoneUrl(path, content):
    try:
        with open(path, 'a+', encoding="utf-8") as f:
            f.write(content + '\n')
    except Exception as e:
        print(e)


# Merge all the per-page spreadsheets into one file
def connectToOne(dir, to_dir, out_file_name):
    frames = []
    for file in os.listdir(dir):
        if file.endswith('.xlsx'):
            print("file:", file)
            frames.append(pd.read_excel(os.path.join(dir, file), dtype={'url': str}))
    print('Start merging')
    excel_df = pd.concat(frames, ignore_index=True)
    excel_df['currency'] = '$'
    # On newer pandas versions, pass engine_kwargs={'options': {'strings_to_urls': False}} instead of options=...
    writer = pd.ExcelWriter(os.path.join(to_dir, out_file_name), engine='xlsxwriter',
                            options={'strings_to_urls': False})

    excel_df.to_excel(writer, index=False)
    writer.close()
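After a run, the merged spreadsheet ends up in the OUT folder under the base directory, with the columns shop_name, shop_url, desc, url, price and currency. A minimal way to sanity-check the output (the path below is only an example of the timestamped file name the script generates):

import pandas as pd

# Example path; the real file name contains the timestamp of the run.
merged = pd.read_excel('/Users/huangmengfeng/Desktop/1688_SCRIPY/OUT/s.1688.com_20200101120000.xlsx',
                       dtype={'url': str})
print(merged.shape)    # rows = scraped offers
print(merged.head())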