cache.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. """Cache Management
  2. """
  3. # The following comment should be removed at some point in the future.
  4. # mypy: strict-optional=False
  5. import hashlib
  6. import json
  7. import logging
  8. import os
  9. from pip._vendor.packaging.tags import interpreter_name, interpreter_version
  10. from pip._vendor.packaging.utils import canonicalize_name
  11. from pip._internal.exceptions import InvalidWheelFilename
  12. from pip._internal.models.link import Link
  13. from pip._internal.models.wheel import Wheel
  14. from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
  15. from pip._internal.utils.typing import MYPY_CHECK_RUNNING
  16. from pip._internal.utils.urls import path_to_url
  17. if MYPY_CHECK_RUNNING:
  18. from typing import Optional, Set, List, Any, Dict
  19. from pip._vendor.packaging.tags import Tag
  20. from pip._internal.models.format_control import FormatControl
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
  22. def _hash_dict(d):
  23. # type: (Dict[str, str]) -> str
  24. """Return a stable sha224 of a dictionary."""
  25. s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True)
  26. return hashlib.sha224(s.encode("ascii")).hexdigest()
  27. class Cache(object):
  28. """An abstract class - provides cache directories for data from links
  29. :param cache_dir: The root of the cache.
  30. :param format_control: An object of FormatControl class to limit
  31. binaries being read from the cache.
  32. :param allowed_formats: which formats of files the cache should store.
  33. ('binary' and 'source' are the only allowed values)
  34. """
  35. def __init__(self, cache_dir, format_control, allowed_formats):
  36. # type: (str, FormatControl, Set[str]) -> None
  37. super(Cache, self).__init__()
  38. assert not cache_dir or os.path.isabs(cache_dir)
  39. self.cache_dir = cache_dir or None
  40. self.format_control = format_control
  41. self.allowed_formats = allowed_formats
  42. _valid_formats = {"source", "binary"}
  43. assert self.allowed_formats.union(_valid_formats) == _valid_formats
  44. def _get_cache_path_parts_legacy(self, link):
  45. # type: (Link) -> List[str]
  46. """Get parts of part that must be os.path.joined with cache_dir
  47. Legacy cache key (pip < 20) for compatibility with older caches.
  48. """
  49. # We want to generate an url to use as our cache key, we don't want to
  50. # just re-use the URL because it might have other items in the fragment
  51. # and we don't care about those.
  52. key_parts = [link.url_without_fragment]
  53. if link.hash_name is not None and link.hash is not None:
  54. key_parts.append("=".join([link.hash_name, link.hash]))
  55. key_url = "#".join(key_parts)
  56. # Encode our key url with sha224, we'll use this because it has similar
  57. # security properties to sha256, but with a shorter total output (and
  58. # thus less secure). However the differences don't make a lot of
  59. # difference for our use case here.
  60. hashed = hashlib.sha224(key_url.encode()).hexdigest()
  61. # We want to nest the directories some to prevent having a ton of top
  62. # level directories where we might run out of sub directories on some
  63. # FS.
  64. parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
  65. return parts
  66. def _get_cache_path_parts(self, link):
  67. # type: (Link) -> List[str]
  68. """Get parts of part that must be os.path.joined with cache_dir
  69. """
  70. # We want to generate an url to use as our cache key, we don't want to
  71. # just re-use the URL because it might have other items in the fragment
  72. # and we don't care about those.
  73. key_parts = {"url": link.url_without_fragment}
  74. if link.hash_name is not None and link.hash is not None:
  75. key_parts[link.hash_name] = link.hash
  76. if link.subdirectory_fragment:
  77. key_parts["subdirectory"] = link.subdirectory_fragment
  78. # Include interpreter name, major and minor version in cache key
  79. # to cope with ill-behaved sdists that build a different wheel
  80. # depending on the python version their setup.py is being run on,
  81. # and don't encode the difference in compatibility tags.
  82. # https://github.com/pypa/pip/issues/7296
  83. key_parts["interpreter_name"] = interpreter_name()
  84. key_parts["interpreter_version"] = interpreter_version()
  85. # Encode our key url with sha224, we'll use this because it has similar
  86. # security properties to sha256, but with a shorter total output (and
  87. # thus less secure). However the differences don't make a lot of
  88. # difference for our use case here.
  89. hashed = _hash_dict(key_parts)
  90. # We want to nest the directories some to prevent having a ton of top
  91. # level directories where we might run out of sub directories on some
  92. # FS.
  93. parts = [hashed[:2], hashed[2:4], hashed[4:6], hashed[6:]]
  94. return parts
  95. def _get_candidates(self, link, canonical_package_name):
  96. # type: (Link, Optional[str]) -> List[Any]
  97. can_not_cache = (
  98. not self.cache_dir or
  99. not canonical_package_name or
  100. not link
  101. )
  102. if can_not_cache:
  103. return []
  104. formats = self.format_control.get_allowed_formats(
  105. canonical_package_name
  106. )
  107. if not self.allowed_formats.intersection(formats):
  108. return []
  109. candidates = []
  110. path = self.get_path_for_link(link)
  111. if os.path.isdir(path):
  112. for candidate in os.listdir(path):
  113. candidates.append((candidate, path))
  114. # TODO remove legacy path lookup in pip>=21
  115. legacy_path = self.get_path_for_link_legacy(link)
  116. if os.path.isdir(legacy_path):
  117. for candidate in os.listdir(legacy_path):
  118. candidates.append((candidate, legacy_path))
  119. return candidates
  120. def get_path_for_link_legacy(self, link):
  121. # type: (Link) -> str
  122. raise NotImplementedError()
  123. def get_path_for_link(self, link):
  124. # type: (Link) -> str
  125. """Return a directory to store cached items in for link.
  126. """
  127. raise NotImplementedError()
  128. def get(
  129. self,
  130. link, # type: Link
  131. package_name, # type: Optional[str]
  132. supported_tags, # type: List[Tag]
  133. ):
  134. # type: (...) -> Link
  135. """Returns a link to a cached item if it exists, otherwise returns the
  136. passed link.
  137. """
  138. raise NotImplementedError()
  139. class SimpleWheelCache(Cache):
  140. """A cache of wheels for future installs.
  141. """
  142. def __init__(self, cache_dir, format_control):
  143. # type: (str, FormatControl) -> None
  144. super(SimpleWheelCache, self).__init__(
  145. cache_dir, format_control, {"binary"}
  146. )
  147. def get_path_for_link_legacy(self, link):
  148. # type: (Link) -> str
  149. parts = self._get_cache_path_parts_legacy(link)
  150. return os.path.join(self.cache_dir, "wheels", *parts)
  151. def get_path_for_link(self, link):
  152. # type: (Link) -> str
  153. """Return a directory to store cached wheels for link
  154. Because there are M wheels for any one sdist, we provide a directory
  155. to cache them in, and then consult that directory when looking up
  156. cache hits.
  157. We only insert things into the cache if they have plausible version
  158. numbers, so that we don't contaminate the cache with things that were
  159. not unique. E.g. ./package might have dozens of installs done for it
  160. and build a version of 0.0...and if we built and cached a wheel, we'd
  161. end up using the same wheel even if the source has been edited.
  162. :param link: The link of the sdist for which this will cache wheels.
  163. """
  164. parts = self._get_cache_path_parts(link)
  165. # Store wheels within the root cache_dir
  166. return os.path.join(self.cache_dir, "wheels", *parts)
  167. def get(
  168. self,
  169. link, # type: Link
  170. package_name, # type: Optional[str]
  171. supported_tags, # type: List[Tag]
  172. ):
  173. # type: (...) -> Link
  174. candidates = []
  175. if not package_name:
  176. return link
  177. canonical_package_name = canonicalize_name(package_name)
  178. for wheel_name, wheel_dir in self._get_candidates(
  179. link, canonical_package_name
  180. ):
  181. try:
  182. wheel = Wheel(wheel_name)
  183. except InvalidWheelFilename:
  184. continue
  185. if canonicalize_name(wheel.name) != canonical_package_name:
  186. logger.debug(
  187. "Ignoring cached wheel {} for {} as it "
  188. "does not match the expected distribution name {}.".format(
  189. wheel_name, link, package_name
  190. )
  191. )
  192. continue
  193. if not wheel.supported(supported_tags):
  194. # Built for a different python/arch/etc
  195. continue
  196. candidates.append(
  197. (
  198. wheel.support_index_min(supported_tags),
  199. wheel_name,
  200. wheel_dir,
  201. )
  202. )
  203. if not candidates:
  204. return link
  205. _, wheel_name, wheel_dir = min(candidates)
  206. return Link(path_to_url(os.path.join(wheel_dir, wheel_name)))
  207. class EphemWheelCache(SimpleWheelCache):
  208. """A SimpleWheelCache that creates it's own temporary cache directory
  209. """
  210. def __init__(self, format_control):
  211. # type: (FormatControl) -> None
  212. self._temp_dir = TempDirectory(
  213. kind=tempdir_kinds.EPHEM_WHEEL_CACHE,
  214. globally_managed=True,
  215. )
  216. super(EphemWheelCache, self).__init__(
  217. self._temp_dir.path, format_control
  218. )
  219. class CacheEntry(object):
  220. def __init__(
  221. self,
  222. link, # type: Link
  223. persistent, # type: bool
  224. ):
  225. self.link = link
  226. self.persistent = persistent
  227. class WheelCache(Cache):
  228. """Wraps EphemWheelCache and SimpleWheelCache into a single Cache
  229. This Cache allows for gracefully degradation, using the ephem wheel cache
  230. when a certain link is not found in the simple wheel cache first.
  231. """
  232. def __init__(self, cache_dir, format_control):
  233. # type: (str, FormatControl) -> None
  234. super(WheelCache, self).__init__(
  235. cache_dir, format_control, {'binary'}
  236. )
  237. self._wheel_cache = SimpleWheelCache(cache_dir, format_control)
  238. self._ephem_cache = EphemWheelCache(format_control)
  239. def get_path_for_link_legacy(self, link):
  240. # type: (Link) -> str
  241. return self._wheel_cache.get_path_for_link_legacy(link)
  242. def get_path_for_link(self, link):
  243. # type: (Link) -> str
  244. return self._wheel_cache.get_path_for_link(link)
  245. def get_ephem_path_for_link(self, link):
  246. # type: (Link) -> str
  247. return self._ephem_cache.get_path_for_link(link)
  248. def get(
  249. self,
  250. link, # type: Link
  251. package_name, # type: Optional[str]
  252. supported_tags, # type: List[Tag]
  253. ):
  254. # type: (...) -> Link
  255. cache_entry = self.get_cache_entry(link, package_name, supported_tags)
  256. if cache_entry is None:
  257. return link
  258. return cache_entry.link
  259. def get_cache_entry(
  260. self,
  261. link, # type: Link
  262. package_name, # type: Optional[str]
  263. supported_tags, # type: List[Tag]
  264. ):
  265. # type: (...) -> Optional[CacheEntry]
  266. """Returns a CacheEntry with a link to a cached item if it exists or
  267. None. The cache entry indicates if the item was found in the persistent
  268. or ephemeral cache.
  269. """
  270. retval = self._wheel_cache.get(
  271. link=link,
  272. package_name=package_name,
  273. supported_tags=supported_tags,
  274. )
  275. if retval is not link:
  276. return CacheEntry(retval, persistent=True)
  277. retval = self._ephem_cache.get(
  278. link=link,
  279. package_name=package_name,
  280. supported_tags=supported_tags,
  281. )
  282. if retval is not link:
  283. return CacheEntry(retval, persistent=False)
  284. return None