# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2015 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#
import gzip
from io import BytesIO
import json
import logging
import os
import posixpath
import re
try:
    import threading
except ImportError:  # pragma: no cover
    import dummy_threading as threading
import zlib

from . import DistlibException
from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
                     queue, quote, unescape, string_types, build_opener,
                     HTTPRedirectHandler as BaseRedirectHandler, text_type,
                     Request, HTTPError, URLError)
from .database import Distribution, DistributionPath, make_dist
from .metadata import Metadata, MetadataInvalidError
from .util import (cached_property, parse_credentials, ensure_slash,
                   split_filename, get_project_data, parse_requirement,
                   parse_name_and_version, ServerProxy, normalize_name)
from .version import get_scheme, UnsupportedVersionError
from .wheel import Wheel, is_compatible

logger = logging.getLogger(__name__)

HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
DEFAULT_INDEX = 'https://pypi.org/pypi'


def get_all_distribution_names(url=None):
    """
    Return all distribution names known by an index.
    :param url: The URL of the index.
    :return: A list of all known distribution names.
    """
    if url is None:
        url = DEFAULT_INDEX
    client = ServerProxy(url, timeout=3.0)
    try:
        return client.list_packages()
    finally:
        client('close')()
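

# A minimal usage sketch (assumes network access to an index that still
# serves the XML-RPC API, which PyPI has deprecated):
#
#     from distlib.locators import get_all_distribution_names
#     names = get_all_distribution_names()
#     print(len(names), 'projects known to the index')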


class RedirectHandler(BaseRedirectHandler):
    """
    A class to work around a bug in some Python 3.2.x releases.
    """
    # There's a bug in the base version for some 3.2.x
    # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
    # returns e.g. /abc, it bails because it says the scheme ''
    # is bogus, when actually it should use the request's
    # URL for the scheme. See Python issue #13696.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use first header.
        newurl = None
        for key in ('location', 'uri'):
            if key in headers:
                newurl = headers[key]
                break
        if newurl is None:  # pragma: no cover
            return
        urlparts = urlparse(newurl)
        if urlparts.scheme == '':
            newurl = urljoin(req.get_full_url(), newurl)
            if hasattr(headers, 'replace_header'):
                headers.replace_header(key, newurl)
            else:
                headers[key] = newurl
        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302


class Locator(object):
    """
    A base class for locators - things that locate distributions.
    """
    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
    binary_extensions = ('.egg', '.exe', '.whl')
    excluded_extensions = ('.pdf',)

    # A list of tags indicating which wheels you want to match. The default
    # value of None matches against the tags compatible with the running
    # Python. If you want to match other values, set wheel_tags on a locator
    # instance to a list of tuples (pyver, abi, arch) which you want to match.
    wheel_tags = None

    downloadable_extensions = source_extensions + ('.whl',)

    def __init__(self, scheme='default'):
        """
        Initialise an instance.
        :param scheme: Because locators look for most recent versions, they
                       need to know the version scheme to use. This specifies
                       the current PEP-recommended scheme - use ``'legacy'``
                       if you need to support existing distributions on PyPI.
        """
        self._cache = {}
        self.scheme = scheme
        # Because of bugs in some of the handlers on some of the platforms,
        # we use our own opener rather than just using urlopen.
        self.opener = build_opener(RedirectHandler())
        # If get_project() is called from locate(), the matcher instance
        # is set from the requirement passed to locate(). See issue #18 for
        # why this can be useful to know.
        self.matcher = None
        self.errors = queue.Queue()

    def get_errors(self):
        """
        Return any errors which have occurred.
        """
        result = []
        while not self.errors.empty():  # pragma: no cover
            try:
                e = self.errors.get(False)
                result.append(e)
            except queue.Empty:
                continue
            self.errors.task_done()
        return result

    def clear_errors(self):
        """
        Clear any errors which may have been logged.
        """
        # Just get the errors and throw them away
        self.get_errors()

    def clear_cache(self):
        self._cache.clear()

    def _get_scheme(self):
        return self._scheme

    def _set_scheme(self, value):
        self._scheme = value

    scheme = property(_get_scheme, _set_scheme)

    def _get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to
        Distribution instances.

        This should be implemented in subclasses.

        If called from a locate() request, self.matcher will be set to a
        matcher for the requirement to satisfy, otherwise it will be None.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to
        Distribution instances.

        This calls _get_project to do all the work, and just implements a
        caching layer on top.
        """
        if self._cache is None:  # pragma: no cover
            result = self._get_project(name)
        elif name in self._cache:
            result = self._cache[name]
        else:
            self.clear_errors()
            result = self._get_project(name)
            self._cache[name] = result
        return result

    def score_url(self, url):
        """
        Give a URL a score which can be used to choose preferred URLs
        for a given project release.
        """
        t = urlparse(url)
        basename = posixpath.basename(t.path)
        compatible = True
        is_wheel = basename.endswith('.whl')
        is_downloadable = basename.endswith(self.downloadable_extensions)
        if is_wheel:
            compatible = is_compatible(Wheel(basename), self.wheel_tags)
        return (t.scheme == 'https', 'pypi.org' in t.netloc,
                is_downloadable, is_wheel, compatible, basename)

    def prefer_url(self, url1, url2):
        """
        Choose one of two URLs where both are candidates for distribution
        archives for the same version of a distribution (for example,
        .tar.gz vs. zip).

        The current implementation favours https:// URLs over http://, archives
        from PyPI over those from other locations, wheel compatibility (if a
        wheel) and then the archive name.
        """
        result = url2
        if url1:
            s1 = self.score_url(url1)
            s2 = self.score_url(url2)
            if s1 > s2:
                result = url1
            if result != url2:
                logger.debug('Not replacing %r with %r', url1, url2)
            else:
                logger.debug('Replacing %r with %r', url1, url2)
        return result
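
    # Illustration only (hypothetical URLs): score_url returns a tuple of
    # booleans ordered by priority, so ordinary tuple comparison does the
    # ranking. An https wheel hosted on pypi.org outscores an http sdist
    # hosted elsewhere, and prefer_url keeps the former:
    #
    #     loc = Locator()
    #     loc.prefer_url('https://pypi.org/packages/foo-1.0-py3-none-any.whl',
    #                    'http://example.com/foo-1.0.tar.gz')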

    def split_filename(self, filename, project_name):
        """
        Attempt to split a filename into project name, version and Python
        version.
        """
        return split_filename(filename, project_name)

    def convert_url_to_download_info(self, url, project_name):
        """
        See if a URL is a candidate for a download URL for a project (the URL
        has typically been scraped from an HTML page).

        If it is, a dictionary is returned with keys "name", "version",
        "filename" and "url"; otherwise, None is returned.
        """
        def same_project(name1, name2):
            return normalize_name(name1) == normalize_name(name2)

        result = None
        scheme, netloc, path, params, query, frag = urlparse(url)
        if frag.lower().startswith('egg='):  # pragma: no cover
            logger.debug('%s: version hint in fragment: %r',
                         project_name, frag)
        m = HASHER_HASH.match(frag)
        if m:
            algo, digest = m.groups()
        else:
            algo, digest = None, None
        origpath = path
        if path and path[-1] == '/':  # pragma: no cover
            path = path[:-1]
        if path.endswith('.whl'):
            try:
                wheel = Wheel(path)
                if not is_compatible(wheel, self.wheel_tags):
                    logger.debug('Wheel not compatible: %s', path)
                else:
                    if project_name is None:
                        include = True
                    else:
                        include = same_project(wheel.name, project_name)
                    if include:
                        result = {
                            'name': wheel.name,
                            'version': wheel.version,
                            'filename': wheel.filename,
                            'url': urlunparse((scheme, netloc, origpath,
                                               params, query, '')),
                            'python-version': ', '.join(
                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
                        }
            except Exception:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
            logger.debug('Not downloadable: %s', path)
        else:  # downloadable extension
            path = filename = posixpath.basename(path)
            for ext in self.downloadable_extensions:
                if path.endswith(ext):
                    path = path[:-len(ext)]
                    t = self.split_filename(path, project_name)
                    if not t:  # pragma: no cover
                        logger.debug('No match for project/version: %s', path)
                    else:
                        name, version, pyver = t
                        if not project_name or same_project(project_name, name):
                            result = {
                                'name': name,
                                'version': version,
                                'filename': filename,
                                'url': urlunparse((scheme, netloc, origpath,
                                                   params, query, '')),
                                #'packagetype': 'sdist',
                            }
                            if pyver:  # pragma: no cover
                                result['python-version'] = pyver
                    break
        if result and algo:
            result['%s_digest' % algo] = digest
        return result
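
    # For illustration (hypothetical URL and digest): a compatible wheel URL
    # such as 'https://example.com/foo-1.0-py3-none-any.whl#sha256=abc123'
    # would yield something like:
    #
    #     {'name': 'foo', 'version': '1.0',
    #      'filename': 'foo-1.0-py3-none-any.whl',
    #      'url': 'https://example.com/foo-1.0-py3-none-any.whl',
    #      'python-version': '3', 'sha256_digest': 'abc123'}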

    def _get_digest(self, info):
        """
        Get a digest from a dictionary by looking at a "digests" dictionary
        or keys of the form 'algo_digest'.

        Returns a 2-tuple (algo, digest) if found, else None. Currently
        looks only for SHA256, then MD5.
        """
        result = None
        if 'digests' in info:
            digests = info['digests']
            for algo in ('sha256', 'md5'):
                if algo in digests:
                    result = (algo, digests[algo])
                    break
        if not result:
            for algo in ('sha256', 'md5'):
                key = '%s_digest' % algo
                if key in info:
                    result = (algo, info[key])
                    break
        return result

    def _update_version_data(self, result, info):
        """
        Update a result dictionary (the final result from _get_project) with a
        dictionary for a specific version, which typically holds information
        gleaned from a filename or URL for an archive for the distribution.
        """
        name = info.pop('name')
        version = info.pop('version')
        if version in result:
            dist = result[version]
            md = dist.metadata
        else:
            dist = make_dist(name, version, scheme=self.scheme)
            md = dist.metadata
        dist.digest = digest = self._get_digest(info)
        url = info['url']
        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
        result['urls'].setdefault(version, set()).add(url)
        dist.locator = self
        result[version] = dist

    def locate(self, requirement, prereleases=False):
        """
        Find the most recent distribution which matches the given
        requirement.

        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
                            'foo (>= 1.0, < 2.0, != 1.3)'
        :param prereleases: If ``True``, allow pre-release versions
                            to be located. Otherwise, pre-release versions
                            are not returned.
        :return: A :class:`Distribution` instance, or ``None`` if no such
                 distribution could be located.
        """
        result = None
        r = parse_requirement(requirement)
        if r is None:  # pragma: no cover
            raise DistlibException('Not a valid requirement: %r' % requirement)
        scheme = get_scheme(self.scheme)
        self.matcher = matcher = scheme.matcher(r.requirement)
        logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
        versions = self.get_project(r.name)
        if len(versions) > 2:  # urls and digests keys are present
            # sometimes, versions are invalid
            slist = []
            vcls = matcher.version_class
            for k in versions:
                if k in ('urls', 'digests'):
                    continue
                try:
                    if not matcher.match(k):
                        logger.debug('%s did not match %r', matcher, k)
                    else:
                        if prereleases or not vcls(k).is_prerelease:
                            slist.append(k)
                        else:
                            logger.debug('skipping pre-release '
                                         'version %s of %s', k, matcher.name)
                except Exception:  # pragma: no cover
                    logger.warning('error matching %s with %r', matcher, k)
                    pass  # slist.append(k)
            if len(slist) > 1:
                slist = sorted(slist, key=scheme.key)
            if slist:
                logger.debug('sorted list: %s', slist)
                version = slist[-1]
                result = versions[version]
        if result:
            if r.extras:
                result.extras = r.extras
            result.download_urls = versions.get('urls', {}).get(version, set())
            d = {}
            sd = versions.get('digests', {})
            for url in result.download_urls:
                if url in sd:  # pragma: no cover
                    d[url] = sd[url]
            result.digests = d
        self.matcher = None
        return result


class PyPIRPCLocator(Locator):
    """
    This locator uses XML-RPC to locate distributions. It therefore
    cannot be used with simple mirrors (that only mirror file content).
    """
    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The URL to use for XML-RPC.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIRPCLocator, self).__init__(**kwargs)
        self.base_url = url
        self.client = ServerProxy(url, timeout=3.0)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        return set(self.client.list_packages())

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        versions = self.client.package_releases(name, True)
        for v in versions:
            urls = self.client.release_urls(name, v)
            data = self.client.release_data(name, v)
            metadata = Metadata(scheme=self.scheme)
            metadata.name = data['name']
            metadata.version = data['version']
            metadata.license = data.get('license')
            metadata.keywords = data.get('keywords', [])
            metadata.summary = data.get('summary')
            dist = Distribution(metadata)
            if urls:
                info = urls[0]
                metadata.source_url = info['url']
                dist.digest = self._get_digest(info)
            dist.locator = self
            result[v] = dist
            for info in urls:
                url = info['url']
                digest = self._get_digest(info)
                result['urls'].setdefault(v, set()).add(url)
                result['digests'][url] = digest
        return result
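

# A usage sketch (PyPI's XML-RPC API is deprecated, so this is mainly of use
# against indexes which still serve it):
#
#     locator = PyPIRPCLocator('https://pypi.org/pypi')
#     versions = locator.get_project('distlib')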


class PyPIJSONLocator(Locator):
    """
    This locator uses PyPI's JSON interface. It's very limited in functionality
    and probably not worth using.
    """
    def __init__(self, url, **kwargs):
        super(PyPIJSONLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        url = urljoin(self.base_url, '%s/json' % quote(name))
        try:
            resp = self.opener.open(url)
            data = resp.read().decode()  # for now
            d = json.loads(data)
            md = Metadata(scheme=self.scheme)
            data = d['info']
            md.name = data['name']
            md.version = data['version']
            md.license = data.get('license')
            md.keywords = data.get('keywords', [])
            md.summary = data.get('summary')
            dist = Distribution(md)
            dist.locator = self
            result[md.version] = dist
            for info in d['urls']:
                url = info['url']
                dist.download_urls.add(url)
                dist.digests[url] = self._get_digest(info)
                result['urls'].setdefault(md.version, set()).add(url)
                result['digests'][url] = self._get_digest(info)
            # Now get other releases
            for version, infos in d['releases'].items():
                if version == md.version:
                    continue  # already done
                omd = Metadata(scheme=self.scheme)
                omd.name = md.name
                omd.version = version
                odist = Distribution(omd)
                odist.locator = self
                result[version] = odist
                for info in infos:
                    url = info['url']
                    odist.download_urls.add(url)
                    odist.digests[url] = self._get_digest(info)
                    result['urls'].setdefault(version, set()).add(url)
                    result['digests'][url] = self._get_digest(info)
        except Exception as e:
            self.errors.put(text_type(e))
            logger.exception('JSON fetch failed: %s', e)
        return result


class Page(object):
    """
    This class represents a scraped HTML page.
    """
    # The following slightly hairy-looking regex just looks for the contents of
    # an anchor link, which has an attribute "href" either immediately preceded
    # or immediately followed by a "rel" attribute. The attribute values can be
    # declared with double quotes, single quotes or no quotes - which leads to
    # the length of the expression.
    _href = re.compile("""
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
""", re.I | re.S | re.X)
    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)

    def __init__(self, data, url):
        """
        Initialise an instance with the Unicode page contents and the URL they
        came from.
        """
        self.data = data
        self.base_url = self.url = url
        m = self._base.search(self.data)
        if m:
            self.base_url = m.group(1)

    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    @cached_property
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.
        """
        def clean(url):
            "Tidy up a URL."
            scheme, netloc, path, params, query, frag = urlparse(url)
            return urlunparse((scheme, netloc, quote(path),
                               params, query, frag))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result
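

# For illustration (hypothetical markup): Page.links resolves relative URLs
# against the base URL and pairs each with its "rel" attribute:
#
#     page = Page('<a href="/pkgs/foo-1.0.tar.gz" rel="download">foo</a>',
#                 'https://example.com/simple/foo/')
#     page.links  # -> [('https://example.com/pkgs/foo-1.0.tar.gz', 'download')]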


class SimpleScrapingLocator(Locator):
    """
    A locator which scrapes HTML pages to locate downloads for a distribution.
    This runs multiple threads to do the I/O; performance is at least as good
    as pip's PackageFinder, which works in an analogous fashion.
    """

    # These are used to deal with various Content-Encoding schemes.
    decoders = {
        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
        'none': lambda b: b,
    }

    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
        """
        Initialise an instance.

        :param url: The root URL to use for scraping.
        :param timeout: The timeout, in seconds, to be applied to requests.
                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads you want to do I/O.
                            This defaults to 10.
        :param kwargs: Passed to the superclass.
        """
        super(SimpleScrapingLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)
        self.timeout = timeout
        self._page_cache = {}
        self._seen = set()
        self._to_fetch = queue.Queue()
        self._bad_hosts = set()
        self.skip_externals = False
        self.num_workers = num_workers
        self._lock = threading.RLock()
        # See issue #45: we need to be resilient when the locator is used
        # in a thread, e.g. with concurrent.futures. We can't use self._lock
        # as it is for coordinating our internal threads - the ones created
        # in _prepare_threads.
        self._gplock = threading.RLock()
        self.platform_check = False  # See issue #112

    def _prepare_threads(self):
        """
        Threads are created only when get_project is called, and terminate
        before it returns. They are there primarily to parallelise I/O (i.e.
        fetching web pages).
        """
        self._threads = []
        for i in range(self.num_workers):
            t = threading.Thread(target=self._fetch)
            t.daemon = True
            t.start()
            self._threads.append(t)

    def _wait_threads(self):
        """
        Tell all the threads to terminate (by sending a sentinel value) and
        wait for them to do so.
        """
        # Note that you need two loops, since you can't say which
        # thread will get each sentinel
        for t in self._threads:
            self._to_fetch.put(None)  # sentinel
        for t in self._threads:
            t.join()
        self._threads = []

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        with self._gplock:
            self.result = result
            self.project_name = name
            url = urljoin(self.base_url, '%s/' % quote(name))
            self._seen.clear()
            self._page_cache.clear()
            self._prepare_threads()
            try:
                logger.debug('Queueing %s', url)
                self._to_fetch.put(url)
                self._to_fetch.join()
            finally:
                self._wait_threads()
            del self.result
        return result

    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)

    def _is_platform_dependent(self, url):
        """
        Does a URL refer to a platform-specific download?
        """
        return self.platform_dependent.search(url)

    def _process_download(self, url):
        """
        See if a URL is a suitable download for a project.

        If it is, register information in the result dictionary (for
        _get_project) about the specific version it's for.

        Note that the return value isn't actually used other than as a boolean
        value.
        """
        if self.platform_check and self._is_platform_dependent(url):
            info = None
        else:
            info = self.convert_url_to_download_info(url, self.project_name)
        logger.debug('process_download: %s -> %s', url, info)
        if info:
            with self._lock:  # needed because self.result is shared
                self._update_version_data(self.result, info)
        return info

    def _should_queue(self, link, referrer, rel):
        """
        Determine whether a link URL from a referring page and with a
        particular "rel" attribute should be queued for scraping.
        """
        scheme, netloc, path, _, _, _ = urlparse(link)
        if path.endswith(self.source_extensions + self.binary_extensions +
                         self.excluded_extensions):
            result = False
        elif self.skip_externals and not link.startswith(self.base_url):
            result = False
        elif not referrer.startswith(self.base_url):
            result = False
        elif rel not in ('homepage', 'download'):
            result = False
        elif scheme not in ('http', 'https', 'ftp'):
            result = False
        elif self._is_platform_dependent(link):
            result = False
        else:
            host = netloc.split(':', 1)[0]
            if host.lower() == 'localhost':
                result = False
            else:
                result = True
        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
                     referrer, result)
        return result

    def _fetch(self):
        """
        Get a URL to fetch from the work queue, get the HTML page, examine its
        links for download candidates and candidates for further scraping.

        This is a handy method to run in a thread.
        """
        while True:
            url = self._to_fetch.get()
            try:
                if url:
                    page = self.get_page(url)
                    if page is None:  # e.g. after an error
                        continue
                    for link, rel in page.links:
                        if link not in self._seen:
                            try:
                                self._seen.add(link)
                                if (not self._process_download(link) and
                                        self._should_queue(link, url, rel)):
                                    logger.debug('Queueing %s from %s',
                                                 link, url)
                                    self._to_fetch.put(link)
                            except MetadataInvalidError:  # e.g. invalid versions
                                pass
            except Exception as e:  # pragma: no cover
                self.errors.put(text_type(e))
            finally:
                # always do this, to avoid hangs :-)
                self._to_fetch.task_done()
            if not url:
                #logger.debug('Sentinel seen, quitting.')
                break

    def get_page(self, url):
        """
        Get the HTML for a URL, possibly from an in-memory cache.

        XXX TODO Note: this cache is never actually cleared. It's assumed that
        the data won't get stale over the lifetime of a locator instance (not
        necessarily true for the default_locator).
        """
        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
        scheme, netloc, path, _, _, _ = urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            url = urljoin(ensure_slash(url), 'index.html')
        if url in self._page_cache:
            result = self._page_cache[url]
            logger.debug('Returning %s from cache: %s', url, result)
        else:
            host = netloc.split(':', 1)[0]
            result = None
            if host in self._bad_hosts:
                logger.debug('Skipping %s due to bad host %s', url, host)
            else:
                req = Request(url, headers={'Accept-encoding': 'identity'})
                try:
                    logger.debug('Fetching %s', url)
                    resp = self.opener.open(req, timeout=self.timeout)
                    logger.debug('Fetched %s', url)
                    headers = resp.info()
                    content_type = headers.get('Content-Type', '')
                    if HTML_CONTENT_TYPE.match(content_type):
                        final_url = resp.geturl()
                        data = resp.read()
                        encoding = headers.get('Content-Encoding')
                        if encoding:
                            decoder = self.decoders[encoding]  # fail if not found
                            data = decoder(data)
                        encoding = 'utf-8'
                        m = CHARSET.search(content_type)
                        if m:
                            encoding = m.group(1)
                        try:
                            data = data.decode(encoding)
                        except UnicodeError:  # pragma: no cover
                            data = data.decode('latin-1')  # fallback
                        result = Page(data, final_url)
                        self._page_cache[final_url] = result
                except HTTPError as e:
                    if e.code != 404:
                        logger.exception('Fetch failed: %s: %s', url, e)
                except URLError as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                    with self._lock:
                        self._bad_hosts.add(host)
                except Exception as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                finally:
                    self._page_cache[url] = result  # even if None (failure)
        return result

    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        page = self.get_page(self.base_url)
        if not page:
            raise DistlibException('Unable to get %s' % self.base_url)
        for match in self._distname_re.finditer(page.data):
            result.add(match.group(1))
        return result
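

# A usage sketch against the PyPI simple index (network access assumed):
#
#     locator = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0,
#                                     scheme='legacy')
#     dist = locator.locate('distlib (>= 0.1)')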


class DirectoryLocator(Locator):
    """
    This class locates distributions in a directory tree.
    """
    def __init__(self, path, **kwargs):
        """
        Initialise an instance.

        :param path: The root of the directory tree to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * recursive - if True (the default), subdirectories are
                         recursed into. If False, only the top-level directory
                         is searched.
        """
        self.recursive = kwargs.pop('recursive', True)
        super(DirectoryLocator, self).__init__(**kwargs)
        path = os.path.abspath(path)
        if not os.path.isdir(path):  # pragma: no cover
            raise DistlibException('Not a directory: %r' % path)
        self.base_dir = path

    def should_include(self, filename, parent):
        """
        Should a filename be considered as a candidate for a distribution
        archive? As well as the filename, the directory which contains it
        is provided, though not used by the current implementation.
        """
        return filename.endswith(self.downloadable_extensions)

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, name)
                    if info:
                        self._update_version_data(result, info)
            if not self.recursive:
                break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for root, dirs, files in os.walk(self.base_dir):
            for fn in files:
                if self.should_include(fn, root):
                    fn = os.path.join(root, fn)
                    url = urlunparse(('file', '',
                                      pathname2url(os.path.abspath(fn)),
                                      '', '', ''))
                    info = self.convert_url_to_download_info(url, None)
                    if info:
                        result.add(info['name'])
            if not self.recursive:
                break
        return result
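

# A usage sketch (hypothetical path): point the locator at a local directory
# of archives, e.g. a wheelhouse:
#
#     locator = DirectoryLocator('/path/to/wheelhouse', recursive=False)
#     dist = locator.locate('foo (== 1.0)')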


class JSONLocator(Locator):
    """
    This locator uses special extended metadata (not available on PyPI) and is
    the basis of performant dependency resolution in distlib. Other locators
    require archive downloads before dependencies can be determined! As you
    might imagine, that can be slow.
    """
    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        data = get_project_data(name)
        if data:
            for info in data.get('files', []):
                if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
                    continue
                # We don't store summary in project metadata as it makes
                # the data bigger for no benefit during dependency
                # resolution
                dist = make_dist(data['name'], info['version'],
                                 summary=data.get('summary',
                                                  'Placeholder for summary'),
                                 scheme=self.scheme)
                md = dist.metadata
                md.source_url = info['url']
                # TODO SHA256 digest
                if 'digest' in info and info['digest']:
                    dist.digest = ('md5', info['digest'])
                md.dependencies = info.get('requirements', {})
                dist.exports = info.get('exports', {})
                result[dist.version] = dist
                result['urls'].setdefault(dist.version, set()).add(info['url'])
        return result


class DistPathLocator(Locator):
    """
    This locator finds installed distributions in a path. It can be useful for
    adding to an :class:`AggregatingLocator`.
    """
    def __init__(self, distpath, **kwargs):
        """
        Initialise an instance.

        :param distpath: A :class:`DistributionPath` instance to search.
        """
        super(DistPathLocator, self).__init__(**kwargs)
        assert isinstance(distpath, DistributionPath)
        self.distpath = distpath

    def _get_project(self, name):
        dist = self.distpath.get_distribution(name)
        if dist is None:
            result = {'urls': {}, 'digests': {}}
        else:
            result = {
                dist.version: dist,
                'urls': {dist.version: set([dist.source_url])},
                'digests': {dist.version: set([None])}
            }
        return result
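

# A usage sketch, searching distributions installed on sys.path:
#
#     from distlib.database import DistributionPath
#     locator = DistPathLocator(DistributionPath())
#     result = locator.get_project('pip')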


class AggregatingLocator(Locator):
    """
    This class allows you to chain and/or merge a list of locators.
    """
    def __init__(self, *locators, **kwargs):
        """
        Initialise an instance.

        :param locators: The list of locators to search.
        :param kwargs: Passed to the superclass constructor,
                       except for:
                       * merge - if False (the default), the first successful
                         search from any of the locators is returned. If True,
                         the results from all locators are merged (this can be
                         slow).
        """
        self.merge = kwargs.pop('merge', False)
        self.locators = locators
        super(AggregatingLocator, self).__init__(**kwargs)

    def clear_cache(self):
        super(AggregatingLocator, self).clear_cache()
        for locator in self.locators:
            locator.clear_cache()

    def _set_scheme(self, value):
        self._scheme = value
        for locator in self.locators:
            locator.scheme = value

    scheme = property(Locator.scheme.fget, _set_scheme)

    def _get_project(self, name):
        result = {}
        for locator in self.locators:
            d = locator.get_project(name)
            if d:
                if self.merge:
                    files = result.get('urls', {})
                    digests = result.get('digests', {})
                    # next line could overwrite result['urls'], result['digests']
                    result.update(d)
                    df = result.get('urls')
                    if files and df:
                        for k, v in files.items():
                            if k in df:
                                df[k] |= v
                            else:
                                df[k] = v
                    dd = result.get('digests')
                    if digests and dd:
                        dd.update(digests)
                else:
                    # See issue #18. If any dists are found and we're looking
                    # for specific constraints, we only return something if
                    # a match is found. For example, if a DirectoryLocator
                    # returns just foo (1.0) while we're looking for
                    # foo (>= 2.0), we'll pretend there was nothing there so
                    # that subsequent locators can be queried. Otherwise we
                    # would just return foo (1.0) which would then lead to a
                    # failure to find foo (>= 2.0), because other locators
                    # weren't searched. Note that this only matters when
                    # merge=False.
                    if self.matcher is None:
                        found = True
                    else:
                        found = False
                        for k in d:
                            if self.matcher.match(k):
                                found = True
                                break
                    if found:
                        result = d
                        break
        return result

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        for locator in self.locators:
            try:
                result |= locator.get_distribution_names()
            except NotImplementedError:
                pass
        return result
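

# A usage sketch (hypothetical path) chaining a local directory with a
# scraping locator, trying the directory first:
#
#     locator = AggregatingLocator(
#         DirectoryLocator('/path/to/wheelhouse'),
#         SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
#         scheme='legacy')
#     dist = locator.locate('requests (>= 2.0)')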


# We use a legacy scheme simply because most of the dists on PyPI use legacy
# versions which don't conform to PEP 426 / PEP 440.
default_locator = AggregatingLocator(
    JSONLocator(),
    SimpleScrapingLocator('https://pypi.org/simple/',
                          timeout=3.0),
    scheme='legacy')

locate = default_locator.locate
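
# For example (network access assumed):
#
#     from distlib.locators import locate
#     dist = locate('sarge (>= 0.1)')
#     if dist is not None:
#         print(dist.name_and_version, dist.source_url)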

NAME_VERSION_RE = re.compile(r'(?P<name>[\w-]+)\s*'
                             r'\(\s*(==\s*)?(?P<ver>[^)]+)\)$')


class DependencyFinder(object):
    """
    Locate dependencies for distributions.
    """
    def __init__(self, locator=None):
        """
        Initialise an instance, using the specified locator
        to locate distributions.
        """
        self.locator = locator or default_locator
        self.scheme = get_scheme(self.locator.scheme)

    def add_distribution(self, dist):
        """
        Add a distribution to the finder. This will update internal information
        about who provides what.
        :param dist: The distribution to add.
        """
        logger.debug('adding distribution %s', dist)
        name = dist.key
        self.dists_by_name[name] = dist
        self.dists[(name, dist.version)] = dist
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
            self.provided.setdefault(name, set()).add((version, dist))

    def remove_distribution(self, dist):
        """
        Remove a distribution from the finder. This will update internal
        information about who provides what.
        :param dist: The distribution to remove.
        """
        logger.debug('removing distribution %s', dist)
        name = dist.key
        del self.dists_by_name[name]
        del self.dists[(name, dist.version)]
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Remove from provided: %s, %s, %s',
                         name, version, dist)
            s = self.provided[name]
            s.remove((version, dist))
            if not s:
                del self.provided[name]

    def get_matcher(self, reqt):
        """
        Get a version matcher for a requirement.
        :param reqt: The requirement
        :type reqt: str
        :return: A version matcher (an instance of
                 :class:`distlib.version.Matcher`).
        """
        try:
            matcher = self.scheme.matcher(reqt)
        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat-mode if cannot read the version
            name = reqt.split()[0]
            matcher = self.scheme.matcher(name)
        return matcher

    def find_providers(self, reqt):
        """
        Find the distributions which can fulfill a requirement.

        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distributions which can fulfill the requirement.
        """
        matcher = self.get_matcher(reqt)
        name = matcher.key  # case-insensitive
        result = set()
        provided = self.provided
        if name in provided:
            for version, provider in provided[name]:
                try:
                    match = matcher.match(version)
                except UnsupportedVersionError:
                    match = False
                if match:
                    result.add(provider)
                    break
        return result

    def try_to_replace(self, provider, other, problems):
        """
        Attempt to replace one provider with another. This is typically used
        when resolving dependencies from multiple sources, e.g. A requires
        (B >= 1.0) while C requires (B >= 1.1).

        For successful replacement, ``provider`` must meet all the requirements
        which ``other`` fulfills.

        :param provider: The provider we are trying to replace with.
        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain what
                         problems prevented replacement. This is currently
                         a tuple of the literal string 'cantreplace',
                         ``provider``, ``other`` and the set of requirements
                         that ``provider`` couldn't fulfill.
        :return: True if we can replace ``other`` with ``provider``, else
                 False.
        """
        rlist = self.reqts[other]
        unmatched = set()
        for s in rlist:
            matcher = self.get_matcher(s)
            if not matcher.match(provider.version):
                unmatched.add(s)
        if unmatched:
            # can't replace other with provider
            problems.add(('cantreplace', provider, other,
                          frozenset(unmatched)))
            result = False
        else:
            # can replace other with provider
            self.remove_distribution(other)
            del self.reqts[other]
            for s in rlist:
                self.reqts.setdefault(provider, set()).add(s)
            self.add_distribution(provider)
            result = True
        return result

    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distributions returned should be such that they have the
        :attr:`required` attribute set to ``True`` if they were
        from the ``requirement`` passed to ``find()``, and they have the
        :attr:`build_time_dependency` attribute set to ``True`` unless they
        are post-installation dependencies of the ``requirement``.

        The problems should be a tuple consisting of the string
        ``'unsatisfied'`` and the requirement which couldn't be satisfied
        by any distribution known to the locator.
        """
        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key  # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            if meta_extras and dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems
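

# A usage sketch (network access assumed, via the default locator):
#
#     finder = DependencyFinder()
#     dists, problems = finder.find('sarge (>= 0.1)')
#     for d in sorted(dists, key=lambda d: d.name_and_version):
#         print(d.name_and_version, d.build_time_dependency)
#     if problems:
#         print('Problems:', problems)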