python实现简单得google搜索
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


class GoogleSpider:
    """Scrape Google (google.com.hk) result pages for a keyword.

    The keyword can be given once at construction time and/or per call to
    :meth:`search`; the per-call value wins when both are present.
    """

    # Request template. The ``ei`` and ``ved`` values are opaque tokens copied
    # verbatim from a captured browser request.
    # NOTE(review): they look session-specific and may go stale - confirm.
    SEARCH_URL = (
        "https://www.google.com.hk/search?q={query}&newwindow=1"
        "&ei=pbdXYtL9FNW-0PEPv96DiA0&start={start}&sa=N"
        "&ved=2ahUKEwiS5Nqv75L3AhVVHzQIHT_vANEQ8tMDegQIARA1"
        "&biw=1536&bih=396&dpr=1.25"
    )

    # Cookie header sent with every request, kept byte-for-byte from the
    # original script.
    # NOTE(review): this is a hard-coded browser cookie snapshot dated
    # 2022-04-14; it should probably come from configuration - confirm.
    COOKIE = (
        "CONSENT=YES+srp.gws-20211208-0-RC2.zh-CN+FX+870; "
        "AEC=AVQQ_LBBv2AdMIJg5Mo-mhbpPvz7Yy6TXL2YDpPEIWPZ2V12AZNvVRj01w; 1P_JAR=2022-04-14-04; "
        "NID=511"
        "=WG_TSuY8P75PO_IIAjeBJh4D9Z1peKXWPh22PDAN62GWAajB5gIj6tvOQRCjHX5g9PEJPyM2RDB_ZlT5qS3lSXhUpOA1U9KkBkt3UbLM6uoHIZubQoHzZMzstsr_e_8eMDo9LPs18nMvIJf-4C6F_XC6TvZCYmgER4Dt2YzXRu6DhCoDljBI46qarDZiCTFDKvy2PNp_hzrGTfOUqg; DV=I3h3GDVGyQsvcNiZldgA7vxYqO5jAlg4dyRxmh2zaAEAAAA "
    )

    # CSS selector for one organic-result container.
    # NOTE(review): the class name is site markup and may change - confirm.
    RESULT_SELECTOR = "div[class='yuRUbf']"

    # Seconds to wait for the server before giving up on a page.
    REQUEST_TIMEOUT = 10

    def __init__(self, **kwargs):
        """Store the optional default ``keyword`` used by :meth:`search`."""
        self.keyword = kwargs.get("keyword")

    def __del__(self):
        """No-op; the spider holds no resources that need releasing."""
        pass

    def search(self, **kwargs) -> list:
        """Crawl result pages for a keyword and return the collected hits.

        Keyword Args:
            keyword: Search phrase. Falls back to the keyword given to the
                constructor when omitted or ``None``.
            max_pages: Optional upper bound on the number of result pages to
                fetch. ``None`` (the default) keeps paging until a page has
                no results, a non-200 status is returned, or an error occurs.

        Returns:
            A list of dicts with keys ``"company"`` (the keyword with spaces
            replaced by ``+``), ``"title"`` and ``"url"``. Empty when no
            keyword is available. Results gathered before an error are kept.
        """
        keyword = kwargs.get("keyword")
        if keyword is None:
            keyword = self.keyword
        if not keyword:
            return []
        max_pages = kwargs.get("max_pages")

        # Display/record form used in the progress message and the
        # "company" field.
        query = keyword.replace(' ', '+')
        # Percent-encode the raw keyword so characters such as '&', '#' or
        # '+' cannot corrupt the query string.
        encoded_query = quote_plus(keyword)

        data = []
        page = 0
        while max_pages is None or page < max_pages:
            try:
                print("当前正在搜索【" + str(query) + "】,当前第" + str(page) + "页...")
                resp = self._fetch_page(encoded_query, page)
                if resp.status_code != 200:
                    # Blocked / rate-limited / error page: retrying the same
                    # URL in a tight loop would not help, so stop here.
                    break
                results = self._parse_results(resp.content, query)
            except Exception as e:
                # Best-effort crawl: report the problem and return whatever
                # has been collected so far instead of raising.
                print(e)
                break

            if not results:
                break

            print(len(results))
            for item in results:
                print(item["title"])
                print(item["url"])
                print(item)
            data.extend(results)
            page += 1
        return data

    def _fetch_page(self, encoded_query, page):
        """Download one result page and dump its raw body to ``1.html``."""
        url = self.SEARCH_URL.format(query=encoded_query, start=page * 10)
        headers = {
            # A fresh random User-Agent string is drawn for every request.
            # NOTE(review): the ``path`` argument depends on the installed
            # fake_useragent version - confirm it is still supported.
            'User-Agent': str(UserAgent(path="ua.json").random),
            "cookie": self.COOKIE,
        }
        resp = requests.get(
            url,
            headers=headers,
            verify=True,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Debug dump of the most recent response; overwritten on every page.
        with open("1.html", "wb") as f:
            f.write(resp.content)
        return resp

    def _parse_results(self, html, query):
        """Extract ``{"company", "title", "url"}`` dicts from a result page."""
        soup = BeautifulSoup(html, "html.parser")
        results = []
        for block in soup.select(self.RESULT_SELECTOR):
            anchor = block.select_one("a")
            if anchor is None:
                continue
            href = anchor.attrs.get("href")
            heading = anchor.select_one("h3")
            if href is None or heading is None:
                # Skip a malformed block rather than abort the whole crawl.
                continue
            # Titles are stored with every newline and space removed.
            title = heading.text.strip().replace("\n", "").replace(" ", "")
            results.append({"company": query, "title": title, "url": href})
        return results


if __name__ == "__main__":
    gs = GoogleSpider()
    keyword = "python"
    data = gs.search(keyword=keyword)