download.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. """Download files with progress indicators.
  2. """
  3. import cgi
  4. import logging
  5. import mimetypes
  6. import os
  7. from pip._vendor import requests
  8. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
  9. from pip._internal.cli.progress_bars import DownloadProgressProvider
  10. from pip._internal.models.index import PyPI
  11. from pip._internal.network.cache import is_from_cache
  12. from pip._internal.network.utils import response_chunks
  13. from pip._internal.utils.misc import (
  14. format_size,
  15. redact_auth_from_url,
  16. splitext,
  17. )
  18. from pip._internal.utils.typing import MYPY_CHECK_RUNNING
  19. if MYPY_CHECK_RUNNING:
  20. from typing import Iterable, Optional
  21. from pip._vendor.requests.models import Response
  22. from pip._internal.models.link import Link
  23. from pip._internal.network.session import PipSession
  24. logger = logging.getLogger(__name__)
  25. def _get_http_response_size(resp):
  26. # type: (Response) -> Optional[int]
  27. try:
  28. return int(resp.headers['content-length'])
  29. except (ValueError, KeyError, TypeError):
  30. return None
  31. def _prepare_download(
  32. resp, # type: Response
  33. link, # type: Link
  34. progress_bar # type: str
  35. ):
  36. # type: (...) -> Iterable[bytes]
  37. total_length = _get_http_response_size(resp)
  38. if link.netloc == PyPI.file_storage_domain:
  39. url = link.show_url
  40. else:
  41. url = link.url_without_fragment
  42. logged_url = redact_auth_from_url(url)
  43. if total_length:
  44. logged_url = '{} ({})'.format(logged_url, format_size(total_length))
  45. if is_from_cache(resp):
  46. logger.info("Using cached %s", logged_url)
  47. else:
  48. logger.info("Downloading %s", logged_url)
  49. if logger.getEffectiveLevel() > logging.INFO:
  50. show_progress = False
  51. elif is_from_cache(resp):
  52. show_progress = False
  53. elif not total_length:
  54. show_progress = True
  55. elif total_length > (40 * 1000):
  56. show_progress = True
  57. else:
  58. show_progress = False
  59. chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
  60. if not show_progress:
  61. return chunks
  62. return DownloadProgressProvider(
  63. progress_bar, max=total_length
  64. )(chunks)
  65. def sanitize_content_filename(filename):
  66. # type: (str) -> str
  67. """
  68. Sanitize the "filename" value from a Content-Disposition header.
  69. """
  70. return os.path.basename(filename)
  71. def parse_content_disposition(content_disposition, default_filename):
  72. # type: (str, str) -> str
  73. """
  74. Parse the "filename" value from a Content-Disposition header, and
  75. return the default filename if the result is empty.
  76. """
  77. _type, params = cgi.parse_header(content_disposition)
  78. filename = params.get('filename')
  79. if filename:
  80. # We need to sanitize the filename to prevent directory traversal
  81. # in case the filename contains ".." path parts.
  82. filename = sanitize_content_filename(filename)
  83. return filename or default_filename
  84. def _get_http_response_filename(resp, link):
  85. # type: (Response, Link) -> str
  86. """Get an ideal filename from the given HTTP response, falling back to
  87. the link filename if not provided.
  88. """
  89. filename = link.filename # fallback
  90. # Have a look at the Content-Disposition header for a better guess
  91. content_disposition = resp.headers.get('content-disposition')
  92. if content_disposition:
  93. filename = parse_content_disposition(content_disposition, filename)
  94. ext = splitext(filename)[1] # type: Optional[str]
  95. if not ext:
  96. ext = mimetypes.guess_extension(
  97. resp.headers.get('content-type', '')
  98. )
  99. if ext:
  100. filename += ext
  101. if not ext and link.url != resp.url:
  102. ext = os.path.splitext(resp.url)[1]
  103. if ext:
  104. filename += ext
  105. return filename
  106. def _http_get_download(session, link):
  107. # type: (PipSession, Link) -> Response
  108. target_url = link.url.split('#', 1)[0]
  109. resp = session.get(
  110. target_url,
  111. # We use Accept-Encoding: identity here because requests
  112. # defaults to accepting compressed responses. This breaks in
  113. # a variety of ways depending on how the server is configured.
  114. # - Some servers will notice that the file isn't a compressible
  115. # file and will leave the file alone and with an empty
  116. # Content-Encoding
  117. # - Some servers will notice that the file is already
  118. # compressed and will leave the file alone and will add a
  119. # Content-Encoding: gzip header
  120. # - Some servers won't notice anything at all and will take
  121. # a file that's already been compressed and compress it again
  122. # and set the Content-Encoding: gzip header
  123. # By setting this to request only the identity encoding We're
  124. # hoping to eliminate the third case. Hopefully there does not
  125. # exist a server which when given a file will notice it is
  126. # already compressed and that you're not asking for a
  127. # compressed file and will then decompress it before sending
  128. # because if that's the case I don't think it'll ever be
  129. # possible to make this work.
  130. headers={"Accept-Encoding": "identity"},
  131. stream=True,
  132. )
  133. resp.raise_for_status()
  134. return resp
  135. class Download(object):
  136. def __init__(
  137. self,
  138. response, # type: Response
  139. filename, # type: str
  140. chunks, # type: Iterable[bytes]
  141. ):
  142. # type: (...) -> None
  143. self.response = response
  144. self.filename = filename
  145. self.chunks = chunks
  146. class Downloader(object):
  147. def __init__(
  148. self,
  149. session, # type: PipSession
  150. progress_bar, # type: str
  151. ):
  152. # type: (...) -> None
  153. self._session = session
  154. self._progress_bar = progress_bar
  155. def __call__(self, link):
  156. # type: (Link) -> Download
  157. try:
  158. resp = _http_get_download(self._session, link)
  159. except requests.HTTPError as e:
  160. logger.critical(
  161. "HTTP error %s while getting %s", e.response.status_code, link
  162. )
  163. raise
  164. return Download(
  165. resp,
  166. _get_http_response_filename(resp, link),
  167. _prepare_download(resp, link, self._progress_bar),
  168. )