python+selenium+xpath 爬取天眼查工商基本信息


# -*- coding:utf-8 -*-
# author: kevin
# CreateTime: 2018/8/16
# software-version: python 3.7


import time
from selenium import webdriver
from selenium.webdriver import Firefox
import os

class GetCompanyInfo(object):
    """
    爬取天眼查下的企业的信息
    """
    def __init__(self):
        """
        初始化爬虫执行代理，使用firefox访问
        """
        self.username = ''
        self.password = ''
        self.options = webdriver.FirefoxOptions()
        self.options.add_argument('-headless')  # 无头参数
        self.geckodriver = r'geckodriver'
        self.driver = Firefox(executable_path=self.geckodriver, firefox_options=self.options)

        self.start_url = 'https://www.tianyancha.com'

    def login(self):
        """
        登录并检查状态
        :return:
        """
        # try:
        self.driver.get(self.start_url)

        #print(self.driver.get_cookies())

        username = self.index_login()
        username_pattern = username[:3] + ' **** ' + username[-4:]
        print(username_pattern)
        page = self.driver.page_source
        #print(page)
        is_login = page.find(username_pattern)

        if is_login != -1:
            print('登录成功')

    def index_login(self):
        """
        主页下的登录模式
        :return:
        """
        get_login = self.driver.find_elements_by_xpath('//a[@class="link-white"]')[0]   # 登录/注册
        print(get_login.text)
        # url为login的input
        get_login.click()
        login_by_pwd = self.driver.find_element_by_xpath('//div[@class="bgContent"]/div[3]/div[2]/div')     # 切换到手机登录
        login_by_pwd.click()
        input1 = self.driver.find_element_by_xpath('//div[@class="bgContent"]/div[3]/div[1]/div[2]/input')     # 手机号码
        input2 = self.driver.find_element_by_xpath('//div[@class="bgContent"]/div[3]/div[1]/div[3]/input')     # 密码
        #print(input1.get_attribute('placeholder'))
        #print(input2.get_attribute('placeholder'))
        username, password = self._check_user_pass()
        input1.send_keys(username)
        input2.send_keys(password)

        login_button = self.driver.find_element_by_xpath('//div[@class="bgContent"]/div[3]/div[1]/div[5]')     # 点击登录
        #print(login_button.text)
        time.sleep(1)   # 必须等待否则鉴别是爬虫
        login_button.click()
        return username

    def _check_user_pass(self):
        """
        检查是否有帐号密码
        :return:
        """
        if self.username and self.password:
            return self.username, self.password
        else:
            #username = input('输入您的手机号码\n')
            #password = input('输入您的密码\n')
            username = '***'  #注册手机号
            password = '***'  #密码
            return username, password

    def login_page_login(self):
        """
        url：www.tianyancha.com/login
        在这个url下的登录模式
        :return:
        """
        input1 = self.driver.find_element_by_xpath('//div[contains(@class,"in-block")'
                                                   ' and contains(@class, "vertical-top")'
                                                   ' and contains(@class, "float-right")'
                                                   ' and contains(@class, "right_content")'
                                                   ' and contains(@class, "mt50")'
                                                   ' and contains(@class, "mr5")'
                                                   ' and contains(@class, "mb5")'
                                                   ']/div[2]/div[2]/div[2]/input')

        input2 = self.driver.find_element_by_xpath('//div[contains(@class,"in-block")'
                                                   ' and contains(@class, "vertical-top")'
                                                   ' and contains(@class, "float-right")'
                                                   ' and contains(@class, "right_content")'
                                                   ' and contains(@class, "mt50")'
                                                   ' and contains(@class, "mr5")'
                                                   ' and contains(@class, "mb5")'
                                                   ']/div[2]/div[2]/div[3]/input')
        print(input1.get_attribute('placeholder'))
        input1.send_keys("")
        print(input2.get_attribute('placeholder'))
        input2.send_keys('')

        login_button = self.driver.find_element_by_xpath('//div[contains(@class,"in-block")'
                                                         ' and contains(@class, "vertical-top")'
                                                         ' and contains(@class, "float-right")'
                                                         ' and contains(@class, "right_content")'
                                                         ' and contains(@class, "mt50")'
                                                         ' and contains(@class, "mr5")'
                                                         ' and contains(@class, "mb5")'
                                                         ']/div[2]/div[2]/div[5]')

        #print(login_button.text)
        time.sleep(1)
        login_button.click()

    def get_company_info(self, company_name):
        """
        获取想要的公司信息
        :param company_name:
        :param company_onwer:
        :return:
        """
        time.sleep(1)
        page = self.driver.page_source

        index_input_company = self.driver.find_element_by_xpath('//input[@id="home-main-search"]')  # 主页搜索框

        index_input_company.send_keys(company_name)
        self.driver.find_element_by_xpath('//div[contains(@class, "input-group-btn")'
                                          ' and contains(@class, "btn")'
                                          ' and contains(@class, "-hg")'
                                          '][1]').click()  # 点击搜索

        #page = self.driver.page_source
        #print (page)

        try:
            company_list = self.driver.find_element_by_xpath('.//div[contains(@class, "header")][1]/a[1]') # 获取当前页面所有公司的div
            href = company_list.get_attribute('href')
            self.driver.get(href)  # 进入公司详情页
            company = self.save_company_info()
            return company
        except:
            print('没有搜索到相应的公司')
            return company_name

    def save_company_info(self):

        #a = self.driver.find_element_by_xpath('.//div[contains(@class, "nav-item-list list list-hover-show  nav-line")][1]/a[1]/span[1]')
        #print ('___显示手机号已是登录状态____' + a.text)

        company = ''
        # 公司名
        company_name = self.driver.find_element_by_xpath('.//div[contains(@class, "header")][1]/h1')
        company += '公司名：' + company_name.text + '\n'
        # 电话
        phone = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[1]/div[1]/span[2]')
        company += '电话：' + phone.text + '\n'
        # 邮箱
        email = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[1]/div[2]/span[2]')
        company += 'email：' + email.text + '\n'
        #e = self.driver.find_element_by_xpath('.//div[@class="detail "][1]/div[1]/div[2]/span[3]/span[1]')
        # 获取更多邮箱
        #email = self.get_email(e)
        # 网址
        try:
            link = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[2]/div[1]/a[1]')
        except:
            link = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[2]/div[1]/span[2]')
        company += '网址：' + link.text + '\n'
        print(link.text)
        # 地址
        try:
            ad = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[2]/div[2]/span[2]')
            address = ad.get_attribute('title')
        except:
            ad = self.driver.find_element_by_xpath('.//div[contains(@class, "detail ")][1]/div[2]/div[2]')
            address = ad.text
        company += '地址：' + address + '\n'
        # 简介
        try:
            s = self.driver.find_element_by_xpath('.//div[contains(@class, "summary")][1]/span[3]')
            summary = self.get_summary(s)
        except:
            summary = self.driver.find_element_by_xpath('.//div[contains(@class, "summary")][1]/span[2]')
        company += '简介：' + summary.text + '\n'
        #法定代表人
        company += '法定代表人 '
        # 姓名
        name = self.driver.find_element_by_xpath('.//div[contains(@class, "humancompany")][1]/div[1]/a[1]')
        company  += '姓名：' + name.text + '\n'
        # 公司数
        company_num = self.driver.find_element_by_xpath('.//div[contains(@class, "humancompany")][1]/div[2]/span[1]')
        #company['company_num'] = '介绍：' + '他有' + company_num.text+'家公司分布如下'
        #介绍内容

        money = self.driver.find_element_by_xpath('.//div[contains(@id, "_container_baseInfo")][1]/table[1]/tbody[1]/tr[1]/td[2]/div[2]')
        money_reg = money.get_attribute('title')
        company += '注册资本' + money_reg + '\n'

        date_reg = self.driver.find_element_by_xpath('.//div[contains(@id, "_container_baseInfo")][1]/table[1]/tbody[1]/tr[2]/td[1]/div[2]/text[1]')
        company += '注册时间' + date_reg.text + '\n'

        cancel = self.driver.find_element_by_xpath('.//div[contains(@class, "num-cancel")][1]')
        company += '公司状态' + cancel.text + '\n'


        maincompany = self.driver.find_elements_by_xpath('.//div[contains(@class, "merge")]/div')
        maincompany_info = ''
        for com in maincompany:
            area = com.find_element_by_xpath('.//div[contains(@class, "title")][1]')
            company_detail = com.find_element_by_xpath('.//div[contains(@class, "maincompany")][1]/span[1]')
            maincompany_info += area.text + company_detail.text + ','
        company += maincompany_info + '\n'
        #工商注册号
        company_content = self.driver.find_elements_by_xpath('.//table[contains(@class, "table")'
                                                             ' and contains(@class, "-striped-col")'
                                                             ' and contains(@class, "-border-top-none")'
                                                             '][1]/tbody[1]/tr')
        business = ''
        for cont in company_content:
            company_title_1 = cont.find_element_by_xpath('.//td[1]')

            company_info_1 = cont.find_element_by_xpath('.//td[2]')
            try:
               company_title_2 = cont.find_element_by_xpath('.//td[3]')
            except:
                pass
            try:
                company_info_2 = cont.find_element_by_xpath('.//td[4]')
            except:
                pass

            business += company_title_1.text + '：' + company_info_1.text + ',' + company_title_2.text + '：' + company_info_2.text + '\n'

        company += business + '\n'
        #微信号码
        try:
            wechat = self.driver.find_element_by_xpath('.//div[contains(@class, "wechat")][1]/div[2]/div[2]/span[2]')
            company += '微信公众号：' + wechat.text + '\n'
        except:
            pass

        print(company)

        return company


        #print(self.driver.page_source)
    # 得到更多邮箱
    # def get_email(self,e):
    #     e.click()
    #     email = self.driver.find_element_by_xpath('.//div[@class="body  -detail modal-scroll text-center"][1]/div[1]/div')
    #
    #     return email

    # 得到简介详情
    def get_summary(self,e):
        e.click()
        summary = self.driver.find_element_by_xpath('.//div[@class="body -detail modal-scroll"][1]')

        return summary

    #得到
    def get_company_name(self):
        f = open('company_name.txt')
        lines = f.readlines()
        return lines

    def main(self):

        self.login()

        company_name = self.get_company_name()

        for cn in company_name:
            name = cn.split()
            #self.driver.get(self.start_url)
            self.driver.get(self.start_url)
            company = self.get_company_info(name[0])
            if company == name[0]:
                res = company + '没有相关信息' + '\n'
            else:
                res = company

            write_file = os.getcwd() + '/company.txt'
            output = open(write_file, 'a')
            output.write(res)
            output.close()
        self.driver.close()


if __name__ == '__main__':

    # tt = GetCompanyInfo()
    # tt.test()
    time1 = time.time()
    new_crawl = GetCompanyInfo()
    new_crawl.main()
    time2 = time.time()
    print('用时：', int(time2-time1))
保证本地安装 geckodriver 
具体代码： 
链接:https://pan.baidu.com/s/14j00UXe5LRqBh0Hc3DIO1Q  密码:u42q
运行： python tianyancha.py
结果：
posted on 2018-08-27 10:00 程序小院阅读(5804) 评论(0) 收藏举报
刷新页面返回顶部
python+selenium+xpath 爬取天眼查工商基本信息

公告