class INTERFACING():
    """Drive one Chrome window against the NMPA data-search pages.

    The browser is created lazily through ``undetected_chromedriver`` (``uc``)
    the first time :meth:`get_selenium_response` is called.  The click/type
    helpers pause for a few seconds after acting so the page can update.
    """

    # Rows requested per result page; must agree with the "20条/页" option
    # chosen in _select_page_size().
    PAGE_SIZE = 20

    def __init__(self):
        self.driver_initialized = False  # True once get_driver() has succeeded
        self.driver = ''                 # replaced by the uc.Chrome instance
        self.MAX_TRIALS = 2
        # Window that was focused when switch_handle() last ran.
        self.main_page_handle = ''

    def make_soup(self):
        """Return the current page source parsed with BeautifulSoup/lxml."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL of the focused window."""
        return self.driver.current_url

    def get_driver(self):
        """Start Chrome and store it in ``self.driver``."""
        chrome_options = uc.ChromeOptions()
        # Add "--headless" to this list to run without a visible window.
        for argument in (
            # Must be WIDTH,HEIGHT; the stray dot in "1920.,1080" was a typo.
            "--window-size=1920,1080",
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        ):
            chrome_options.add_argument(argument)
        # version_main is the Chrome major version and is compared numerically
        # by the library, so pass an int rather than the string "113".
        self.driver = uc.Chrome(options=chrome_options, version_main=113)
        time.sleep(10)  # fixed pause after launching the browser
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; otherwise do nothing."""
        if not self.driver_initialized:
            # self.driver is still the '' placeholder, which has no quit().
            return
        try:
            self.driver.quit()
        finally:
            # Let a later get_selenium_response() start a fresh browser.
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open *url* (starting the browser on first use) and return its soup."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the focused window."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element at *xpath*, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element at *xpath*, type *value*, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element at *xpath*."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the focused window."""
        self.driver.refresh()

    def close_handle(self):
        """Close the focused window (the browser itself keeps running)."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by *handle*."""
        self.driver.switch_to.window(handle)

    def switch_handle(self, second_handle=''):
        """Focus the first window that is neither the current one nor *second_handle*.

        The window focused on entry is remembered in ``self.main_page_handle``.

        Returns:
            The handle that was switched to, or None when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window *page_handle* (if open) and focus *second_handle*."""
        if page_handle in self.get_all_handles():
            try:
                # close() acts on the focused window, so focus the target first.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.swtich_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the result-table rows in *soup*, or None if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _last_page(total_results, page_size=20):
        """Return the 1-based number of the last page (at least 1)."""
        # Ceiling division: exactly 40 results at 20 per page is 2 pages, not 3.
        return max(1, -(-total_results // page_size))

    def skip_button(self, class_item, max_tries=20):
        """Try to click the intro-overlay link whose class contains *class_item*.

        Args:
            class_item: Substring of the ``class`` attribute of the ``<a>`` to click.
            max_tries: Number of attempts, two seconds apart.

        Returns:
            True when, after the first four failed attempts, the result table or
            the empty-table marker is already present; None otherwise (link
            clicked, "Medical Devices" clicked instead, or attempts exhausted).
        """
        for attempt in range(max_tries):
            # Snapshot taken before the click attempt, as the checks below
            # inspect the page as it was when the click failed.
            soup = self.make_soup()
            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                return None
            except Exception:
                print('skip button not yet visible')
            if attempt > 3:
                if self._result_rows(soup) is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True
                try:
                    self.clicking('//span[text()="Medical Devices"]')
                    return None
                except Exception:
                    pass  # tab not clickable either; keep polling
            time.sleep(2)
        return None

    def _select_page_size(self):
        """Wait for the page-size selector, then choose the "20条/页" option."""
        print('Selecting 20 per page...')
        count = 0
        while True:
            page_selector = self.make_soup().find('input', class_='el-input__inner')
            if page_selector is None:
                print('Record not yet loaded: ', count)
            elif page_selector.attrs.get("placeholder"):
                break
            time.sleep(3)
            count += 1
            if not count % 3:
                print('page refreshed....')
                self.refresh_page()
            if count >= 51:
                break
        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')

    def search_data(self, current_query, page_num):
        """Run a search and print the title of every result row, page by page.

        Args:
            current_query: Text typed into the search box (used when page_num == 1).
            page_num: 1 starts a fresh search; any other value jumps to that
                page in the window that is currently focused.

        Raises:
            RuntimeError: If neither the result table nor the empty-table marker
                shows up after repeated polling.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            # Focus the window other than the one we searched from.
            self.switch_handle()
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            if self.make_soup().find('span', class_='el-table__empty-text') is None:
                self._select_page_size()
        else:
            while True:
                try:
                    self.entering_values('//input[@type="number"]', page_num)
                    break
                except Exception:
                    print('error in entering page num')
                    time.sleep(3)
            self.send_keys('//input[@type="number"]')
            time.sleep(3)

        missing_rounds = 0
        while True:
            soup = self.make_soup()
            if self._result_rows(soup) is None:
                if soup.find('span', class_='el-table__empty-text') is not None:
                    print('No Results...')
                    return
                # Neither table nor empty marker yet: poll a bounded number of times.
                missing_rounds += 1
                if missing_rounds >= 20:
                    raise RuntimeError('result table never appeared on the page')
                time.sleep(3)
                continue
            missing_rounds = 0

            total_results = int(soup.find('span', class_='el-pagination__total').text.strip().split()[1])
            ending_page = self._last_page(total_results, self.PAGE_SIZE)

            # Rows can arrive late: wait for a full page, or for a short page
            # when this is the last one.
            while True:
                soup = self.make_soup()
                all_results = self._result_rows(soup) or []
                if len(all_results) >= self.PAGE_SIZE:
                    break
                if ending_page == page_num:
                    break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)

            for index, row in enumerate(all_results):
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue  # the title is read from the second cell
                result_title = cells[1].text.strip()
                print(page_num, " : ", ending_page, " : ", index, " / ", len(all_results), " : ", result_title,
                      " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)


if __name__ == '__main__':
    REY_NUM = 5  # maximum number of full attempts
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    last_error = None

    for _attempt in range(REY_NUM):
        handle = None
        try:
            handle = INTERFACING()
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")
            if handle.make_soup().find('div', class_='header-main') is None:
                print("访问失败!")

            # Open the "Medical Devices" tab; stop early if results are already shown.
            for _poll in range(5):
                soup = handle.make_soup()
                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break
                if handle.skip_button('introjs-skipbutton'):
                    break
                time.sleep(1)

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
        except Exception as e:
            # Keep the error: the name bound by "as" is deleted after this clause.
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        else:
            break  # success: do not run the scrape again
        finally:
            # Close the browser after every attempt, successful or not.
            if handle is not None and handle.driver_initialized:
                try:
                    handle.close_driver()
                except Exception as close_error:
                    print(f'failed to close the browser: {close_error}')
        time.sleep(60)  # back off before the next attempt
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error
跳过弹窗（handle.skip_button("introjs-skipbutton")）之后：
class INTERFACING():
    """Drive one Chrome window against the NMPA data-search pages.

    The browser is created lazily through ``undetected_chromedriver`` (``uc``)
    the first time :meth:`get_selenium_response` is called.  The click/type
    helpers pause for a few seconds after acting so the page can update.
    """

    # Rows requested per result page; must agree with the "20条/页" option
    # chosen in _select_page_size().
    PAGE_SIZE = 20

    def __init__(self):
        self.driver_initialized = False  # True once get_driver() has succeeded
        self.driver = ''                 # replaced by the uc.Chrome instance
        self.MAX_TRIALS = 2
        # Window that was focused when switch_handle() last ran.
        self.main_page_handle = ''

    def make_soup(self):
        """Return the current page source parsed with BeautifulSoup/lxml."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL of the focused window."""
        return self.driver.current_url

    def get_driver(self):
        """Start Chrome and store it in ``self.driver``."""
        chrome_options = uc.ChromeOptions()
        # Add "--headless" to this list to run without a visible window.
        for argument in (
            # Must be WIDTH,HEIGHT; the stray dot in "1920.,1080" was a typo.
            "--window-size=1920,1080",
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            '--disable-application-cache',
            '--disable-gpu',
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        ):
            chrome_options.add_argument(argument)
        # version_main is the Chrome major version and is compared numerically
        # by the library, so pass an int rather than the string "113".
        self.driver = uc.Chrome(options=chrome_options, version_main=113)
        time.sleep(10)  # fixed pause after launching the browser
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one was started; otherwise do nothing."""
        if not self.driver_initialized:
            # self.driver is still the '' placeholder, which has no quit().
            return
        try:
            self.driver.quit()
        finally:
            # Let a later get_selenium_response() start a fresh browser.
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Open *url* (starting the browser on first use) and return its soup."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the focused window."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the element at *xpath*, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element at *xpath*, type *value*, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element at *xpath*."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the focused window."""
        self.driver.refresh()

    def close_handle(self):
        """Close the focused window (the browser itself keeps running)."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of all open windows."""
        return self.driver.window_handles

    def swtich_to_window(self, handle):
        """Focus the window identified by *handle*."""
        self.driver.switch_to.window(handle)

    def switch_handle(self, second_handle=''):
        """Focus the first window that is neither the current one nor *second_handle*.

        The window focused on entry is remembered in ``self.main_page_handle``.

        Returns:
            The handle that was switched to, or None when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.swtich_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window *page_handle* (if open) and focus *second_handle*."""
        if page_handle in self.get_all_handles():
            try:
                # close() acts on the focused window, so focus the target first.
                self.swtich_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.swtich_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the result-table rows in *soup*, or None if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _last_page(total_results, page_size=20):
        """Return the 1-based number of the last page (at least 1)."""
        # Ceiling division: exactly 40 results at 20 per page is 2 pages, not 3.
        return max(1, -(-total_results // page_size))

    def skip_button(self, class_item, max_tries=20):
        """Try to click the intro-overlay link whose class contains *class_item*.

        Args:
            class_item: Substring of the ``class`` attribute of the ``<a>`` to click.
            max_tries: Number of attempts, two seconds apart.

        Returns:
            True when, after the first four failed attempts, the result table or
            the empty-table marker is already present; None otherwise (link
            clicked, "Medical Devices" clicked instead, or attempts exhausted).
        """
        for attempt in range(max_tries):
            # Snapshot taken before the click attempt, as the checks below
            # inspect the page as it was when the click failed.
            soup = self.make_soup()
            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                return None
            except Exception:
                print('skip button not yet visible')
            if attempt > 3:
                if self._result_rows(soup) is not None:
                    return True
                if soup.find('span', class_='el-table__empty-text') is not None:
                    return True
                try:
                    self.clicking('//span[text()="Medical Devices"]')
                    return None
                except Exception:
                    pass  # tab not clickable either; keep polling
            time.sleep(2)
        return None

    def _select_page_size(self):
        """Wait for the page-size selector, then choose the "20条/页" option."""
        print('Selecting 20 per page...')
        count = 0
        while True:
            page_selector = self.make_soup().find('input', class_='el-input__inner')
            if page_selector is None:
                print('Record not yet loaded: ', count)
            elif page_selector.attrs.get("placeholder"):
                break
            time.sleep(3)
            count += 1
            if not count % 3:
                print('page refreshed....')
                self.refresh_page()
            if count >= 51:
                break
        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]//span[text()="20条/页"]')

    def search_data(self, current_query, page_num):
        """Run a search and print the title of every result row, page by page.

        Args:
            current_query: Text typed into the search box (used when page_num == 1).
            page_num: 1 starts a fresh search; any other value jumps to that
                page in the window that is currently focused.

        Raises:
            RuntimeError: If neither the result table nor the empty-table marker
                shows up after repeated polling.
        """
        if page_num == 1:
            self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
            self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
            # Focus the window other than the one we searched from.
            self.switch_handle()
            self.skip_button('introjs-nextbutton')
            self.skip_button("introjs-skipbutton")
            if self.make_soup().find('span', class_='el-table__empty-text') is None:
                self._select_page_size()
        else:
            while True:
                try:
                    self.entering_values('//input[@type="number"]', page_num)
                    break
                except Exception:
                    print('error in entering page num')
                    time.sleep(3)
            self.send_keys('//input[@type="number"]')
            time.sleep(3)

        missing_rounds = 0
        while True:
            soup = self.make_soup()
            if self._result_rows(soup) is None:
                if soup.find('span', class_='el-table__empty-text') is not None:
                    print('No Results...')
                    return
                # Neither table nor empty marker yet: poll a bounded number of times.
                missing_rounds += 1
                if missing_rounds >= 20:
                    raise RuntimeError('result table never appeared on the page')
                time.sleep(3)
                continue
            missing_rounds = 0

            total_results = int(soup.find('span', class_='el-pagination__total').text.strip().split()[1])
            ending_page = self._last_page(total_results, self.PAGE_SIZE)

            # Rows can arrive late: wait for a full page, or for a short page
            # when this is the last one.
            while True:
                soup = self.make_soup()
                all_results = self._result_rows(soup) or []
                if len(all_results) >= self.PAGE_SIZE:
                    break
                if ending_page == page_num:
                    break
                print(all_results, " : ", total_results, " : ", ending_page)
                time.sleep(3)

            for index, row in enumerate(all_results):
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue  # the title is read from the second cell
                result_title = cells[1].text.strip()
                print(page_num, " : ", ending_page, " : ", index, " / ", len(all_results), " : ", result_title,
                      " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)
if __name__ == '__main__':
    # Script entry point: scrape the NMPA disposable-device listings, retrying
    # the whole run up to REY_NUM times and raising if every attempt fails.
    REY_NUM = 5  # maximum number of full attempts
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'
    last_error = None

    for _attempt in range(REY_NUM):
        handle = None
        try:
            handle = INTERFACING()
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")
            if handle.make_soup().find('div', class_='header-main') is None:
                print("访问失败!")

            # Open the "Medical Devices" tab; stop early if results are already shown.
            for _poll in range(5):
                soup = handle.make_soup()
                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break
                if handle.skip_button('introjs-skipbutton'):
                    break
                time.sleep(1)

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)
        except Exception as e:
            # Keep the error: the name bound by "as" is deleted after this clause.
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        else:
            break  # success: do not run the scrape again
        finally:
            # Close the browser after every attempt, successful or not.  The
            # flag check avoids calling quit() when no browser was ever started.
            if handle is not None and handle.driver_initialized:
                try:
                    handle.close_driver()
                except Exception as close_error:
                    print(f'failed to close the browser: {close_error}')
        time.sleep(60)  # back off before the next attempt
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}") from last_error