For no other reason than to become someone even I would envy.

A simple Google search implemented in Python

The script below fetches Google result pages with requests, parses the organic results with BeautifulSoup, and rotates the User-Agent header with fake_useragent.

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class GoogleSpider:
    def __init__(self, **kwargs):
        self.keyword = kwargs.get("keyword")

    def search(self, **kwargs) -> list:
        data = []
        # Prefer the keyword passed to search(); fall back to the instance keyword.
        query = kwargs.get("keyword")
        if query is None:
            query = self.keyword
        if query is None:
            return data
        # Crude URL encoding: only spaces are handled here; see the
        # quote_plus note below the listing for a more robust approach.
        query = query.replace(" ", "+")
        # URL = f"http://google.com/search?q={query}"
        page = 0
        while True:
            # URL = f"https://www.google.com.hk/search?q={query}&newwindow=1&ei=l51XYufsEJX09APssZboDg&start={page * 10}&sa=N&ved=2ahUKEwinlJbD1pL3AhUVOn0KHeyYBe0Q8tMDegQIAhA1&biw=1536&bih=370&dpr=1.25"
            URL="https://www.google.com.hk/search?q={query}&newwindow=1&ei=pbdXYtL9FNW-0PEPv96DiA0&start={page * 10}&sa=N&ved=2ahUKEwiS5Nqv75L3AhVVHzQIHT_vANEQ8tMDegQIARA1&biw=1536&bih=396&dpr=1.25"
            try:
                print(f"Searching for [{query}], page {page}...")
                headers = {
                    # fake_useragent returns a random real-world UA string;
                    # ua.json is a local cache so no network lookup is needed.
                    "User-Agent": str(UserAgent(path="ua.json").random),
                    # Cookies copied from a browser session; CONSENT skips
                    # Google's consent interstitial. These values expire, so
                    # refresh them from your own browser if requests start failing.
                    "cookie": "CONSENT=YES+srp.gws-20211208-0-RC2.zh-CN+FX+870; "
                              "AEC=AVQQ_LBBv2AdMIJg5Mo-mhbpPvz7Yy6TXL2YDpPEIWPZ2V12AZNvVRj01w; 1P_JAR=2022-04-14-04; "
                              "NID=511"
                              "=WG_TSuY8P75PO_IIAjeBJh4D9Z1peKXWPh22PDAN62GWAajB5gIj6tvOQRCjHX5g9PEJPyM2RDB_ZlT5qS3lSXhUpOA1U9KkBkt3UbLM6uoHIZubQoHzZMzstsr_e_8eMDo9LPs18nMvIJf-4C6F_XC6TvZCYmgER4Dt2YzXRu6DhCoDljBI46qarDZiCTFDKvy2PNp_hzrGTfOUqg; DV=I3h3GDVGyQsvcNiZldgA7vxYqO5jAlg4dyRxmh2zaAEAAAA",
                }

                resp = requests.get(URL, headers=headers, verify=True)
                # Dump the raw response to disk for debugging selector issues.
                with open("1.html", "wb") as f:
                    f.write(resp.content)
                if resp.status_code == 200:
                    soup = BeautifulSoup(resp.content, "html.parser")
                    # Each organic result sits in a div with class "yuRUbf"
                    # (a Google-internal class name that changes periodically,
                    # so expect to update this selector).
                    li_arr = soup.select("div[class='yuRUbf']")
                    if len(li_arr):
                        print(len(li_arr))
                        for key in li_arr:
                            li_a = key.select("a")
                            a_href = li_a[0].attrs["href"]
                            li_h3 = li_a[0].select("h3")
                            # Normalize whitespace in the result title.
                            _title = li_h3[0].text.replace("\n", " ").strip()
                            obj = {"company": query, "title": _title, "url": a_href}
                            print(obj)
                            data.append(obj)  # accumulate results to return
                        page += 1
                    else:
                        # An empty result list means the last page was reached
                        # (or Google served a CAPTCHA page instead of results).
                        break
            except Exception as e:
                print(e)
                break
        return data


if __name__ == "__main__":
    gs = GoogleSpider()
    keyword = "python"
    data = gs.search(keyword=keyword)
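
One caveat: query.replace(" ", "+") only handles spaces, so a keyword containing &, #, or non-ASCII characters would produce a malformed URL. A minimal sketch of proper encoding with the standard library's quote_plus (a suggested hardening, not part of the original script):

from urllib.parse import quote_plus

# quote_plus percent-encodes every reserved character and turns spaces
# into '+', which is the form Google's q= parameter expects.
query = quote_plus("C++ & python 教程")
print(query)  # C%2B%2B+%26+python+%E6%95%99%E7%A8%8B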
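
Google also rate-limits rapid repeated fetches and starts answering with a CAPTCHA page, which this scraper would treat as "no results" and stop. A hedged sketch of a randomized delay that could be called at the end of each loop iteration (the polite_sleep name and the 2-5 second range are my own choices, not from the original post):

import random
import time


def polite_sleep(low: float = 2.0, high: float = 5.0) -> None:
    """Pause for a random interval between page fetches so the request
    pattern looks less robotic and draws fewer CAPTCHA responses."""
    time.sleep(random.uniform(low, high))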

 
