class INTERFACING:
    """Browser-automation helper around an undetected-chromedriver (``uc``) session.

    The class wraps a handful of Selenium primitives (click, type, window
    switching) and two site-specific flows, ``skip_button`` and
    ``search_data``, that drive an Element-UI style result table
    (``el-table__*`` / ``el-pagination__*`` CSS classes).
    """

    # Rows per result page that search_data() selects and then waits for.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls before a wait gives up with TimeoutError.
    MAX_POLLS = 51

    def __init__(self):
        """Create the wrapper without starting a browser."""
        self.driver_initialized = False
        # Placeholder until get_driver() stores the real driver; kept as an
        # empty string so existing truthiness checks keep working.
        self.driver = ''
        # Not referenced inside this class; kept for external users.
        self.MAX_TRIALS = 2
        # Window that was active when switch_handle() last moved away from it.
        self.main_page_handle = None

    def make_soup(self):
        """Return the current page source parsed by BeautifulSoup (lxml parser)."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL currently loaded in the driver."""
        return self.driver.current_url

    def get_driver(self, version_main=113):
        """Start Chrome through undetected-chromedriver and store it on ``self.driver``.

        Args:
            version_main: Major Chrome version passed to ``uc.Chrome``. It is
                coerced with ``int()`` so a numeric string is still accepted.
        """
        chrome_options = uc.ChromeOptions()
        # Add "--headless" to this tuple to run without a visible window.
        arguments = (
            "--window-size=1920,1080",  # was "1920.,1080" (stray dot)
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            "--disable-application-cache",
            "--disable-gpu",
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        )
        for argument in arguments:
            chrome_options.add_argument(argument)

        self.driver = uc.Chrome(options=chrome_options, version_main=int(version_main))
        # Give the freshly started browser time to settle before first use.
        time.sleep(10)
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one is running; a no-op when none was started."""
        if not self.driver:
            # Still the '' placeholder: calling .quit() on it would raise.
            self.driver_initialized = False
            return
        try:
            self.driver.quit()
        finally:
            # Reset state so a later get_selenium_response() starts a new browser.
            self.driver = ''
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Load *url* (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the current page."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the first element matching *xpath*, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element matching *xpath*, type *value*, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element matching *xpath*."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the window the driver is currently focused on."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the currently focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of every open window."""
        return self.driver.window_handles

    def switch_to_window(self, handle):
        """Focus the window identified by *handle*."""
        self.driver.switch_to.window(handle)

    def swtich_to_window(self, handle):
        """Misspelled original name of ``switch_to_window``; kept for callers."""
        self.switch_to_window(handle)

    def switch_handle(self, second_handle=''):
        """Switch to the first window that is neither current nor *second_handle*.

        The window that was current on entry is remembered in
        ``self.main_page_handle``.

        Returns:
            The handle switched to, or ``None`` when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.switch_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window *page_handle* (if open) and focus *second_handle*.

        Closing is best effort: a failure is reported and the switch to
        *second_handle* is still attempted.
        """
        if page_handle in self.get_all_handles():
            try:
                # driver.close() closes the *focused* window, so focus the
                # target first instead of closing whatever happens to be current.
                self.switch_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.switch_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the result-table rows in *soup*, or None if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _total_results(soup):
        """Return the first integer shown in the pagination total, or None."""
        total = soup.find('span', class_='el-pagination__total')
        if total is None:
            return None
        match = re.search(r'\d+', total.text)
        return int(match.group()) if match else None

    @staticmethod
    def _page_count(total_results, page_size=20):
        """Return how many pages *total_results* rows occupy (at least 1)."""
        # Ceiling division: `total // size + 1` over-counts exact multiples.
        return max(1, -(-total_results // page_size))

    def skip_button(self, class_item):
        """Try to dismiss an overlay by clicking ``<a>`` whose class contains *class_item*.

        Up to 20 attempts are made, 2 seconds apart. Each attempt also tries to
        click the "Medical Devices" tab as a fallback.

        Returns:
            ``True`` when, after more than three failed attempts, the page
            already shows the result table or its empty-text marker (nothing
            left to skip); ``None`` after a successful click or when all
            attempts are used up.
        """
        for attempt in range(20):
            soup = self.make_soup()

            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                return None
            except Exception:
                print('skip button not yet visible')

            if attempt > 3:
                table_present = self._result_rows(soup) is not None
                if table_present or soup.find('span', class_='el-table__empty-text') is not None:
                    return True

            try:
                self.clicking('//span[text()="Medical Devices"]')
                return None
            except Exception:
                # Tab not clickable yet; keep polling.
                pass

            time.sleep(2)
        return None

    def _submit_query(self, current_query):
        """Type *current_query*, submit it and prepare the result window.

        Returns:
            The handle of the window switched to after submitting, or ``None``.
        """
        self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
        self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
        second_handle = self.switch_handle()
        self.skip_button('introjs-nextbutton')
        self.skip_button("introjs-skipbutton")

        if self.make_soup().find('span', class_='el-table__empty-text') is not None:
            # Empty table: there is no page-size selector worth touching.
            return second_handle

        print(f'Selecting {self.PAGE_SIZE} per page...')
        for polls in range(1, self.MAX_POLLS + 1):
            selector = self.make_soup().find('input', class_='el-input__inner')
            if selector is None:
                print('Record not yet loaded: ', polls - 1)
            elif selector.attrs.get("placeholder"):
                break
            time.sleep(3)
            if polls % 3 == 0:
                print('page refreshed....')
                self.refresh_page()

        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]'
            f'//span[text()="{self.PAGE_SIZE}条/页"]')
        return second_handle

    def _jump_to_page(self, page_num):
        """Type *page_num* into the pagination jumper and press RETURN."""
        for _ in range(self.MAX_POLLS):
            try:
                self.entering_values('//input[@type="number"]', str(page_num))
                break
            except Exception:
                print('error in entering page num')
            time.sleep(3)
        else:
            raise TimeoutError(f'page-number input never became usable (page {page_num})')
        self.send_keys('//input[@type="number"]')
        time.sleep(3)

    def _wait_for_rows(self, page_num):
        """Poll until page *page_num* is fully rendered.

        Returns:
            ``(soup, rows, total_results, ending_page)`` once a full page, or a
            short last page, is loaded; ``None`` when the page reports no results.

        Raises:
            TimeoutError: the page did not finish loading within ``MAX_POLLS`` polls.
        """
        for _ in range(self.MAX_POLLS):
            soup = self.make_soup()
            rows = self._result_rows(soup)
            total_results = self._total_results(soup)

            if rows is None or total_results is None:
                if soup.find('span', class_='el-table__empty-text') is not None:
                    print('No Results...')
                    return None
            else:
                ending_page = self._page_count(total_results, self.PAGE_SIZE)
                full_page = len(rows) == self.PAGE_SIZE
                short_last_page = len(rows) < self.PAGE_SIZE and ending_page == page_num
                if full_page or short_last_page:
                    return soup, rows, total_results, ending_page
                # Report the row count, not the rows' full HTML.
                print(len(rows), " : ", total_results, " : ", ending_page)

            time.sleep(3)
        raise TimeoutError(f'result page {page_num} did not finish loading')

    def search_data(self, current_query, page_num):
        """Run a search (or resume one) and print the title of every result row.

        Args:
            current_query: Text typed into the search box; only used when
                *page_num* is 1.
            page_num: ``1`` submits *current_query* as a new search. Any other
                value jumps to that page of the results already on screen.

        Raises:
            TimeoutError: a page never finished loading.
        """
        second_handle = None
        if page_num == 1:
            second_handle = self._submit_query(current_query)
        else:
            self._jump_to_page(page_num)

        while True:
            loaded = self._wait_for_rows(page_num)
            if loaded is None:
                break
            soup, rows, total_results, ending_page = loaded

            # The title is read from the second cell of each row.
            for index, row in enumerate(rows):
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue
                result_title = cells[1].text.strip()
                print(page_num, " : ", ending_page, " : ", index, " / ", len(rows), " : ",
                      result_title, " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)

        # Close the window opened for this search and return to the window the
        # query was typed in, so the next search_data(..., 1) finds its search box.
        if second_handle:
            self.close_handles(second_handle, self.main_page_handle)




if __name__ == '__main__':
    # Script entry point: run the whole scrape, retrying up to REY_NUM times
    # with a fresh browser per attempt.
    REY_NUM = 5
    # NOTE(review): unused below; the year range is hard-coded to 2020-2021.
    next_year = datetime.now().year + 1
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'

    # `except ... as e` unbinds `e` when the handler ends, so the last error is
    # kept in its own variable for the final report.
    last_error = None
    for attempt in range(1, REY_NUM + 1):
        handle = INTERFACING()
        try:
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")

            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败!")

            # Up to five tries to open the "Medical Devices" tab.
            for _ in range(5):
                soup = handle.make_soup()

                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')

                # A rendered result table means there is nothing left to click.
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break

                if handle.skip_button('introjs-skipbutton'):
                    break

                time.sleep(1)

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)

            # Success: leave the retry loop so the for/else failure branch is skipped.
            break
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        finally:
            # Only quit a browser that was actually started; cleanup problems
            # must not hide the scrape result.
            if handle.driver_initialized:
                try:
                    handle.close_driver()
                except Exception as close_error:
                    print(f'close_driver failed: {close_error}')

        # Back off before the next attempt, but not after the final one.
        if attempt < REY_NUM:
            time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}"
        ) from last_error

在通过 handle.skip_button("introjs-skipbutton") 跳过弹窗之后：

 

 

class INTERFACING:
    """Browser-automation helper around an undetected-chromedriver (``uc``) session.

    The class wraps a handful of Selenium primitives (click, type, window
    switching) and two site-specific flows, ``skip_button`` and
    ``search_data``, that drive an Element-UI style result table
    (``el-table__*`` / ``el-pagination__*`` CSS classes).
    """

    # Rows per result page that search_data() selects and then waits for.
    PAGE_SIZE = 20
    # Upper bound on 3-second polls before a wait gives up with TimeoutError.
    MAX_POLLS = 51

    def __init__(self):
        """Create the wrapper without starting a browser."""
        self.driver_initialized = False
        # Placeholder until get_driver() stores the real driver; kept as an
        # empty string so existing truthiness checks keep working.
        self.driver = ''
        # Not referenced inside this class; kept for external users.
        self.MAX_TRIALS = 2
        # Window that was active when switch_handle() last moved away from it.
        self.main_page_handle = None

    def make_soup(self):
        """Return the current page source parsed by BeautifulSoup (lxml parser)."""
        return BeautifulSoup(self.driver.page_source, 'lxml')

    def current_url(self):
        """Return the URL currently loaded in the driver."""
        return self.driver.current_url

    def get_driver(self, version_main=113):
        """Start Chrome through undetected-chromedriver and store it on ``self.driver``.

        Args:
            version_main: Major Chrome version passed to ``uc.Chrome``. It is
                coerced with ``int()`` so a numeric string is still accepted.
        """
        chrome_options = uc.ChromeOptions()
        # Add "--headless" to this tuple to run without a visible window.
        arguments = (
            "--window-size=1920,1080",  # was "1920.,1080" (stray dot)
            "--disable-extensions",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--profile-directory=Default",
            "--ignore-certificate-errors",
            "--disable-plugins-discovery",
            "--incognito",
            "--no-first-run",
            "--no-service-autorun",
            "--no-default-browser-check",
            "--password-store=basic",
            "--no-sandbox",
            "--disable-application-cache",
            "--disable-gpu",
            "--disable-setuid-sandbox",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        )
        for argument in arguments:
            chrome_options.add_argument(argument)

        self.driver = uc.Chrome(options=chrome_options, version_main=int(version_main))
        # Give the freshly started browser time to settle before first use.
        time.sleep(10)
        self.driver_initialized = True

    def close_driver(self):
        """Quit the browser if one is running; a no-op when none was started."""
        if not self.driver:
            # Still the '' placeholder: calling .quit() on it would raise.
            self.driver_initialized = False
            return
        try:
            self.driver.quit()
        finally:
            # Reset state so a later get_selenium_response() starts a new browser.
            self.driver = ''
            self.driver_initialized = False

    def get_selenium_response(self, url):
        """Load *url* (starting the driver on first use) and return the parsed page."""
        if not self.driver_initialized:
            self.get_driver()
        self.driver.get(url)
        time.sleep(3)
        return self.make_soup()

    def get_page_source(self):
        """Return the raw HTML of the current page."""
        return self.driver.page_source

    def clicking(self, xpath):
        """Click the first element matching *xpath*, then pause 2-3 seconds."""
        self.driver.find_element(By.XPATH, xpath).click()
        time.sleep(random.randint(2, 3))

    def entering_values(self, xpath, value):
        """Clear the element matching *xpath*, type *value*, then pause 2-4 seconds."""
        elem = self.driver.find_element(By.XPATH, xpath)
        elem.clear()
        elem.send_keys(value)
        time.sleep(random.randint(2, 4))

    def send_keys(self, xpath):
        """Send the RETURN key to the element matching *xpath*."""
        self.driver.find_element(By.XPATH, xpath).send_keys(Keys.RETURN)

    def going_back(self):
        """Navigate one step back in the browser history."""
        self.driver.execute_script("window.history.go(-1)")
        time.sleep(1)

    def refresh_page(self):
        """Reload the current page."""
        self.driver.refresh()

    def close_handle(self):
        """Close the window the driver is currently focused on."""
        self.driver.close()

    def get_current_handle(self):
        """Return the handle of the currently focused window."""
        return self.driver.current_window_handle

    def get_all_handles(self):
        """Return the handles of every open window."""
        return self.driver.window_handles

    def switch_to_window(self, handle):
        """Focus the window identified by *handle*."""
        self.driver.switch_to.window(handle)

    def swtich_to_window(self, handle):
        """Misspelled original name of ``switch_to_window``; kept for callers."""
        self.switch_to_window(handle)

    def switch_handle(self, second_handle=''):
        """Switch to the first window that is neither current nor *second_handle*.

        The window that was current on entry is remembered in
        ``self.main_page_handle``.

        Returns:
            The handle switched to, or ``None`` when no other window exists.
        """
        self.main_page_handle = self.get_current_handle()
        for handle in self.get_all_handles():
            if handle == self.main_page_handle:
                continue
            if second_handle and handle == second_handle:
                continue
            self.switch_to_window(handle)
            return handle
        return None

    def close_handles(self, page_handle, second_handle):
        """Close the window *page_handle* (if open) and focus *second_handle*.

        Closing is best effort: a failure is reported and the switch to
        *second_handle* is still attempted.
        """
        if page_handle in self.get_all_handles():
            try:
                # driver.close() closes the *focused* window, so focus the
                # target first instead of closing whatever happens to be current.
                self.switch_to_window(page_handle)
                self.close_handle()
            except Exception as error:
                print(f'could not close window {page_handle}: {error}')
        self.switch_to_window(second_handle)

    @staticmethod
    def _result_rows(soup):
        """Return the result-table rows in *soup*, or None if the table is absent."""
        table = soup.find('table', class_='el-table__body')
        if table is None or table.tbody is None:
            return None
        return table.tbody.find_all('tr', class_=re.compile('el-table__row'))

    @staticmethod
    def _total_results(soup):
        """Return the first integer shown in the pagination total, or None."""
        total = soup.find('span', class_='el-pagination__total')
        if total is None:
            return None
        match = re.search(r'\d+', total.text)
        return int(match.group()) if match else None

    @staticmethod
    def _page_count(total_results, page_size=20):
        """Return how many pages *total_results* rows occupy (at least 1)."""
        # Ceiling division: `total // size + 1` over-counts exact multiples.
        return max(1, -(-total_results // page_size))

    def skip_button(self, class_item):
        """Try to dismiss an overlay by clicking ``<a>`` whose class contains *class_item*.

        Up to 20 attempts are made, 2 seconds apart. Each attempt also tries to
        click the "Medical Devices" tab as a fallback.

        Returns:
            ``True`` when, after more than three failed attempts, the page
            already shows the result table or its empty-text marker (nothing
            left to skip); ``None`` after a successful click or when all
            attempts are used up.
        """
        for attempt in range(20):
            soup = self.make_soup()

            try:
                self.clicking(f'//a[contains(@class,"{class_item}")]')
                return None
            except Exception:
                print('skip button not yet visible')

            if attempt > 3:
                table_present = self._result_rows(soup) is not None
                if table_present or soup.find('span', class_='el-table__empty-text') is not None:
                    return True

            try:
                self.clicking('//span[text()="Medical Devices"]')
                return None
            except Exception:
                # Tab not clickable yet; keep polling.
                pass

            time.sleep(2)
        return None

    def _submit_query(self, current_query):
        """Type *current_query*, submit it and prepare the result window.

        Returns:
            The handle of the window switched to after submitting, or ``None``.
        """
        self.entering_values('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/input', current_query)
        self.clicking('//*[@id="home"]/main/div[1]/div[7]/div/div[2]/div/button')
        second_handle = self.switch_handle()
        self.skip_button('introjs-nextbutton')
        self.skip_button("introjs-skipbutton")

        if self.make_soup().find('span', class_='el-table__empty-text') is not None:
            # Empty table: there is no page-size selector worth touching.
            return second_handle

        print(f'Selecting {self.PAGE_SIZE} per page...')
        for polls in range(1, self.MAX_POLLS + 1):
            selector = self.make_soup().find('input', class_='el-input__inner')
            if selector is None:
                print('Record not yet loaded: ', polls - 1)
            elif selector.attrs.get("placeholder"):
                break
            time.sleep(3)
            if polls % 3 == 0:
                print('page refreshed....')
                self.refresh_page()

        self.clicking('//*[@id="home"]/div[3]/div[3]/div/div/span[2]/div/div[1]/input')
        self.clicking(
            '//ul[@class="el-scrollbar__view el-select-dropdown__list"]'
            f'//span[text()="{self.PAGE_SIZE}条/页"]')
        return second_handle

    def _jump_to_page(self, page_num):
        """Type *page_num* into the pagination jumper and press RETURN."""
        for _ in range(self.MAX_POLLS):
            try:
                self.entering_values('//input[@type="number"]', str(page_num))
                break
            except Exception:
                print('error in entering page num')
            time.sleep(3)
        else:
            raise TimeoutError(f'page-number input never became usable (page {page_num})')
        self.send_keys('//input[@type="number"]')
        time.sleep(3)

    def _wait_for_rows(self, page_num):
        """Poll until page *page_num* is fully rendered.

        Returns:
            ``(soup, rows, total_results, ending_page)`` once a full page, or a
            short last page, is loaded; ``None`` when the page reports no results.

        Raises:
            TimeoutError: the page did not finish loading within ``MAX_POLLS`` polls.
        """
        for _ in range(self.MAX_POLLS):
            soup = self.make_soup()
            rows = self._result_rows(soup)
            total_results = self._total_results(soup)

            if rows is None or total_results is None:
                if soup.find('span', class_='el-table__empty-text') is not None:
                    print('No Results...')
                    return None
            else:
                ending_page = self._page_count(total_results, self.PAGE_SIZE)
                full_page = len(rows) == self.PAGE_SIZE
                short_last_page = len(rows) < self.PAGE_SIZE and ending_page == page_num
                if full_page or short_last_page:
                    return soup, rows, total_results, ending_page
                # Report the row count, not the rows' full HTML.
                print(len(rows), " : ", total_results, " : ", ending_page)

            time.sleep(3)
        raise TimeoutError(f'result page {page_num} did not finish loading')

    def search_data(self, current_query, page_num):
        """Run a search (or resume one) and print the title of every result row.

        Args:
            current_query: Text typed into the search box; only used when
                *page_num* is 1.
            page_num: ``1`` submits *current_query* as a new search. Any other
                value jumps to that page of the results already on screen.

        Raises:
            TimeoutError: a page never finished loading.
        """
        second_handle = None
        if page_num == 1:
            second_handle = self._submit_query(current_query)
        else:
            self._jump_to_page(page_num)

        while True:
            loaded = self._wait_for_rows(page_num)
            if loaded is None:
                break
            soup, rows, total_results, ending_page = loaded

            # The title is read from the second cell of each row.
            for index, row in enumerate(rows):
                cells = row.find_all('td')
                if len(cells) < 2:
                    continue
                result_title = cells[1].text.strip()
                print(page_num, " : ", ending_page, " : ", index, " / ", len(rows), " : ",
                      result_title, " : ", total_results, " : ", ending_page)

            print(f"page_num: {page_num} Done!")
            page_num += 1
            if page_num > ending_page:
                break
            next_button = soup.find('button', class_='btn-next')
            if next_button is None or 'disabled' in next_button.attrs:
                break
            self.clicking('//button[@class="btn-next"]')
            time.sleep(3)

        # Close the window opened for this search and return to the window the
        # query was typed in, so the next search_data(..., 1) finds its search box.
        if second_handle:
            self.close_handles(second_handle, self.main_page_handle)



if __name__ == '__main__':
    # Script entry point: run the whole scrape, retrying up to REY_NUM times
    # with a fresh browser per attempt.
    REY_NUM = 5
    # NOTE(review): unused below; the year range is hard-coded to 2020-2021.
    next_year = datetime.now().year + 1
    url = r'https://www.nmpa.gov.cn/datasearch/search-result.html'

    # `except ... as e` unbinds `e` when the handler ends, so the last error is
    # kept in its own variable for the final report.
    last_error = None
    for attempt in range(1, REY_NUM + 1):
        handle = INTERFACING()
        try:
            handle.get_selenium_response(url)
            handle.skip_button("introjs-skipbutton")

            soup = handle.make_soup()
            if soup.find('div', class_='header-main') is None:
                print("访问失败!")

            # Up to five tries to open the "Medical Devices" tab.
            for _ in range(5):
                soup = handle.make_soup()

                try:
                    handle.clicking('//span[text()="Medical Devices"]')
                    break
                except Exception:
                    print('Medical button not yet visible')

                # A rendered result table means there is nothing left to click.
                table = soup.find('table', class_='el-table__body')
                if table is not None and table.tbody is not None:
                    break

                if handle.skip_button('introjs-skipbutton'):
                    break

                time.sleep(1)

            handle.clicking("//*[@class='pc-max el-row']/div/a[@title='一次性使用医疗器械产品']")
            for device_type in ["械备", "注进", "注准"]:
                for year in range(2020, 2022):
                    current_query = f'{device_type}{year}'
                    handle.search_data(current_query, 1)

            # Success: leave the retry loop so the for/else failure branch is skipped.
            break
        except Exception as e:
            last_error = e
            print(f'爬取NMPADisposableProductsRequester数据失败: 详情{e}')
        finally:
            # Only quit a browser that was actually started; cleanup problems
            # must not hide the scrape result.
            if handle.driver_initialized:
                try:
                    handle.close_driver()
                except Exception as close_error:
                    print(f'close_driver failed: {close_error}')

        # Back off before the next attempt, but not after the final one.
        if attempt < REY_NUM:
            time.sleep(60)
    else:
        raise Exception(
            f"已经重试{REY_NUM}次, 爬取NMPADisposableProductsRequester数据失败, 详情{last_error}"
        ) from last_error