抓取python所有包
Fetching PyPI data
This notebook fetches every package from the PyPI server (~74,000 packages!) and extracts each package's setup.py, plus any file or folder with the word 'requirements' in its name.
https://kgullikson88.github.io/blog/static/PyPi_Metadata.html
import xmlrpclib

# There is only one API server, so the package index is queried from the
# main PyPI XML-RPC endpoint.
# NOTE(review): the original note mentioned using the "deutschland mirror"
# for downloading; no mirror URL appears in this cell -- confirm intent.
client = xmlrpclib.ServerProxy('http://pypi.python.org/pypi')

# Result of the server's list_packages() call, fetched once up front.
packages = client.list_packages()
import tarfile, re, requests, csv, json
from base64 import b64encode
from kglib.utils.HelperFunctions import ensure_dir
def _save_file(pathname, member, tar_file):
    """Write one archive member to disk, best-effort.

    The output path is ``pathname`` immediately followed by the base name of
    ``member.name`` (no separator is inserted, so ``pathname`` must already
    end with one if it is meant to be a directory).

    Members that cannot be read are skipped silently so that a single bad
    entry does not abort processing of the rest of the archive.

    Args:
        pathname: String prefix for the output file path.
        member: The ``tarfile.TarInfo`` entry to extract.
        tar_file: The open ``tarfile.TarFile`` that contains ``member``.

    Returns:
        None.
    """
    # Local import: the file's visible import lines do not bring in ``os``.
    import os

    try:
        handle = tar_file.extractfile(member)
        # extractfile() yields None for entries that are not regular files
        # or links (e.g. a directory whose name contains 'requirements').
        if handle is None:
            return
        content = handle.read()
    except Exception:
        # Deliberate best-effort: an unreadable or corrupt member is skipped.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate so a long crawl can still be interrupted.
        return

    outfilename = '{}{}'.format(pathname, os.path.basename(member.name))
    # Presumably creates the parent directory of the path if it is missing --
    # ensure_dir is an external helper; verify against kglib.
    ensure_dir(outfilename)
    # Archive contents are raw bytes, so write in binary mode; text mode
    # would translate line endings on Windows and reject bytes on Python 3.
    with open(outfilename, 'wb') as outfile:
        outfile.write(content)
def _extract_files(package_file, name):
    """Save the setup.py and requirements entries found in a tar archive.

    Every member whose path contains ``'setup.py'`` or ``'requirements'`` is
    handed to ``_save_file``. If ``package_file`` cannot be opened as a tar
    archive, the function returns without doing anything.

    Args:
        package_file: File-like object holding the archive; passed to
            ``tarfile.open`` as ``fileobj``.
        name: Output path prefix, forwarded unchanged to ``_save_file``.

    Returns:
        None.
    """
    try:
        tar_file = tarfile.open(fileobj=package_file)
    except Exception:
        # Best-effort: not every download is a readable tar archive.
        # KeyboardInterrupt/SystemExit are intentionally not swallowed.
        return

    try:
        for member in tar_file.getmembers():
            # Substring match on the full member path, so this also picks up
            # files inside a 'requirements' folder and names such as
            # 'requirements-dev.txt'.
            if 'setup.py' in member.name or 'requirements' in member.name:
                _save_file(name, member, tar_file)
    finally:
        # Release the TarFile's resources even if iteration fails; the
        # caller-supplied fileobj itself is left open by TarFile.close().
        tar_file.close()
def extract_package(name, client=xmlrpclib.ServerProxy('http://pypi.python.org/pypi')):
for release in client.package_releases(name):
outdir = 'packages/{}-{}/'.format(name, release)
doc = client.release_urls(name, release)
if doc:
url = None
for d in doc:
if d['python_version'] == 'source' and d['url'].endswith('gz'):
url = d['url']
if url:
#print(doc[3])
#url = doc[0].get('url')#.replace("http://pypi.python.org/", "http://f.pypi.python.org/")
#print "Downloading url %s" % url
req = requests.get(url)
if req.status_code != 200:
print "Could not download file %s" % req.status_code
else