download.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. """Download files with progress indicators.
  2. """
  3. import cgi
  4. import logging
  5. import mimetypes
  6. import os
  7. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
  8. from pip._internal.cli.progress_bars import DownloadProgressProvider
  9. from pip._internal.exceptions import NetworkConnectionError
  10. from pip._internal.models.index import PyPI
  11. from pip._internal.network.cache import is_from_cache
  12. from pip._internal.network.utils import (
  13. HEADERS,
  14. raise_for_status,
  15. response_chunks,
  16. )
  17. from pip._internal.utils.misc import (
  18. format_size,
  19. redact_auth_from_url,
  20. splitext,
  21. )
  22. from pip._internal.utils.typing import MYPY_CHECK_RUNNING
  23. if MYPY_CHECK_RUNNING:
  24. from typing import Iterable, Optional
  25. from pip._vendor.requests.models import Response
  26. from pip._internal.models.link import Link
  27. from pip._internal.network.session import PipSession
  28. logger = logging.getLogger(__name__)
  29. def _get_http_response_size(resp):
  30. # type: (Response) -> Optional[int]
  31. try:
  32. return int(resp.headers['content-length'])
  33. except (ValueError, KeyError, TypeError):
  34. return None
  35. def _prepare_download(
  36. resp, # type: Response
  37. link, # type: Link
  38. progress_bar # type: str
  39. ):
  40. # type: (...) -> Iterable[bytes]
  41. total_length = _get_http_response_size(resp)
  42. if link.netloc == PyPI.file_storage_domain:
  43. url = link.show_url
  44. else:
  45. url = link.url_without_fragment
  46. logged_url = redact_auth_from_url(url)
  47. if total_length:
  48. logged_url = '{} ({})'.format(logged_url, format_size(total_length))
  49. if is_from_cache(resp):
  50. logger.info("Using cached %s", logged_url)
  51. else:
  52. logger.info("Downloading %s", logged_url)
  53. if logger.getEffectiveLevel() > logging.INFO:
  54. show_progress = False
  55. elif is_from_cache(resp):
  56. show_progress = False
  57. elif not total_length:
  58. show_progress = True
  59. elif total_length > (40 * 1000):
  60. show_progress = True
  61. else:
  62. show_progress = False
  63. chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
  64. if not show_progress:
  65. return chunks
  66. return DownloadProgressProvider(
  67. progress_bar, max=total_length
  68. )(chunks)
  69. def sanitize_content_filename(filename):
  70. # type: (str) -> str
  71. """
  72. Sanitize the "filename" value from a Content-Disposition header.
  73. """
  74. return os.path.basename(filename)
  75. def parse_content_disposition(content_disposition, default_filename):
  76. # type: (str, str) -> str
  77. """
  78. Parse the "filename" value from a Content-Disposition header, and
  79. return the default filename if the result is empty.
  80. """
  81. _type, params = cgi.parse_header(content_disposition)
  82. filename = params.get('filename')
  83. if filename:
  84. # We need to sanitize the filename to prevent directory traversal
  85. # in case the filename contains ".." path parts.
  86. filename = sanitize_content_filename(filename)
  87. return filename or default_filename
  88. def _get_http_response_filename(resp, link):
  89. # type: (Response, Link) -> str
  90. """Get an ideal filename from the given HTTP response, falling back to
  91. the link filename if not provided.
  92. """
  93. filename = link.filename # fallback
  94. # Have a look at the Content-Disposition header for a better guess
  95. content_disposition = resp.headers.get('content-disposition')
  96. if content_disposition:
  97. filename = parse_content_disposition(content_disposition, filename)
  98. ext = splitext(filename)[1] # type: Optional[str]
  99. if not ext:
  100. ext = mimetypes.guess_extension(
  101. resp.headers.get('content-type', '')
  102. )
  103. if ext:
  104. filename += ext
  105. if not ext and link.url != resp.url:
  106. ext = os.path.splitext(resp.url)[1]
  107. if ext:
  108. filename += ext
  109. return filename
  110. def _http_get_download(session, link):
  111. # type: (PipSession, Link) -> Response
  112. target_url = link.url.split('#', 1)[0]
  113. resp = session.get(target_url, headers=HEADERS, stream=True)
  114. raise_for_status(resp)
  115. return resp
  116. class Download(object):
  117. def __init__(
  118. self,
  119. response, # type: Response
  120. filename, # type: str
  121. chunks, # type: Iterable[bytes]
  122. ):
  123. # type: (...) -> None
  124. self.response = response
  125. self.filename = filename
  126. self.chunks = chunks
  127. class Downloader(object):
  128. def __init__(
  129. self,
  130. session, # type: PipSession
  131. progress_bar, # type: str
  132. ):
  133. # type: (...) -> None
  134. self._session = session
  135. self._progress_bar = progress_bar
  136. def __call__(self, link):
  137. # type: (Link) -> Download
  138. try:
  139. resp = _http_get_download(self._session, link)
  140. except NetworkConnectionError as e:
  141. assert e.response is not None
  142. logger.critical(
  143. "HTTP error %s while getting %s", e.response.status_code, link
  144. )
  145. raise
  146. return Download(
  147. resp,
  148. _get_http_response_filename(resp, link),
  149. _prepare_download(resp, link, self._progress_bar),
  150. )