import re
import urllib.parse

import requests


def get_search_github(keyword, language, pageIndex):
    """Build a GitHub search URL for repositories matching keyword and language."""
    params = {
        "q": keyword,
        "type": "Repositories",
        "l": language,
        "p": pageIndex,
    }
    return "https://github.com/search?" + urllib.parse.urlencode(params)


def get_github_source_stars(url):
    """Scrape one search-result page and return a list of repository dicts."""
    response = requests.get(url).content.decode("utf-8")
    # Pull the title, description, star label and language out of the repo-list markup.
    # The pattern is tied to GitHub's HTML structure and will break if the markup changes.
    getDataPattern = (
        r'repo-list-item[\s\S]*?mt-n1[\s\S]*?f4 text-normal">[\s\S]*?>([\s\S]*?)</a>'
        r'[\s\S]*?mb-1">([\s\S]+?)</p>[\s\S]*?octicon octicon-star[\s\S]*?</path></svg>'
        r'\s+(\S+)[\s\S]*?programmingLanguage">(\S+)</span>'
    )
    resultList = re.findall(getDataPattern, response)

    SourceList = []
    for item in resultList:
        star = str(item[2]).strip()
        # GitHub renders counts like "35.2k" or "9,876"; normalise to an integer.
        star_count = star.lower().replace(",", "")
        if star_count.endswith("k"):
            star_count = float(star_count.rstrip("k")) * 1000
        else:
            star_count = float(star_count)
        SourceList.append({
            "title": str(item[0]).strip(),
            "language": str(item[3]).strip(),
            "description": str(item[1]).strip(),
            "stars": star,
            "star_count": int(star_count),
        })
    return SourceList
# Walk the first 14 result pages for Python repositories with more than 10,000 stars;
# each iteration rebinds alist to that page's parsed results.
for page in range(1, 15):
    url = get_search_github(r"stars:>10000", "python", page)
    alist = get_github_source_stars(url)
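
# A minimal alternative sketch, not used by the loop above: the official GitHub REST
# search endpoint (https://api.github.com/search/repositories) returns the same
# information as structured JSON, so no HTML regex is needed. The helper name below
# is illustrative; unauthenticated calls to the search API are rate-limited by GitHub.
def get_github_source_stars_api(keyword, language, page):
    resp = requests.get(
        "https://api.github.com/search/repositories",
        params={"q": f"{keyword} language:{language}", "page": page, "per_page": 30},
    )
    return [
        {
            "title": item["full_name"],
            "language": item["language"],
            "description": item["description"],
            "star_count": item["stargazers_count"],
        }
        for item in resp.json().get("items", [])
    ]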