download.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196
  1. """Download files with progress indicators.
  2. """
  3. import cgi
  4. import logging
  5. import mimetypes
  6. import os
  7. from typing import Iterable, Optional, Tuple
  8. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
  9. from pip._internal.cli.progress_bars import DownloadProgressProvider
  10. from pip._internal.exceptions import NetworkConnectionError
  11. from pip._internal.models.index import PyPI
  12. from pip._internal.models.link import Link
  13. from pip._internal.network.cache import is_from_cache
  14. from pip._internal.network.session import PipSession
  15. from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
  16. from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
  17. logger = logging.getLogger(__name__)
  18. def _get_http_response_size(resp):
  19. # type: (Response) -> Optional[int]
  20. try:
  21. return int(resp.headers['content-length'])
  22. except (ValueError, KeyError, TypeError):
  23. return None
  24. def _prepare_download(
  25. resp, # type: Response
  26. link, # type: Link
  27. progress_bar # type: str
  28. ):
  29. # type: (...) -> Iterable[bytes]
  30. total_length = _get_http_response_size(resp)
  31. if link.netloc == PyPI.file_storage_domain:
  32. url = link.show_url
  33. else:
  34. url = link.url_without_fragment
  35. logged_url = redact_auth_from_url(url)
  36. if total_length:
  37. logged_url = '{} ({})'.format(logged_url, format_size(total_length))
  38. if is_from_cache(resp):
  39. logger.info("Using cached %s", logged_url)
  40. else:
  41. logger.info("Downloading %s", logged_url)
  42. if logger.getEffectiveLevel() > logging.INFO:
  43. show_progress = False
  44. elif is_from_cache(resp):
  45. show_progress = False
  46. elif not total_length:
  47. show_progress = True
  48. elif total_length > (40 * 1000):
  49. show_progress = True
  50. else:
  51. show_progress = False
  52. chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
  53. if not show_progress:
  54. return chunks
  55. return DownloadProgressProvider(
  56. progress_bar, max=total_length
  57. )(chunks)
  58. def sanitize_content_filename(filename):
  59. # type: (str) -> str
  60. """
  61. Sanitize the "filename" value from a Content-Disposition header.
  62. """
  63. return os.path.basename(filename)
  64. def parse_content_disposition(content_disposition, default_filename):
  65. # type: (str, str) -> str
  66. """
  67. Parse the "filename" value from a Content-Disposition header, and
  68. return the default filename if the result is empty.
  69. """
  70. _type, params = cgi.parse_header(content_disposition)
  71. filename = params.get('filename')
  72. if filename:
  73. # We need to sanitize the filename to prevent directory traversal
  74. # in case the filename contains ".." path parts.
  75. filename = sanitize_content_filename(filename)
  76. return filename or default_filename
  77. def _get_http_response_filename(resp, link):
  78. # type: (Response, Link) -> str
  79. """Get an ideal filename from the given HTTP response, falling back to
  80. the link filename if not provided.
  81. """
  82. filename = link.filename # fallback
  83. # Have a look at the Content-Disposition header for a better guess
  84. content_disposition = resp.headers.get('content-disposition')
  85. if content_disposition:
  86. filename = parse_content_disposition(content_disposition, filename)
  87. ext = splitext(filename)[1] # type: Optional[str]
  88. if not ext:
  89. ext = mimetypes.guess_extension(
  90. resp.headers.get('content-type', '')
  91. )
  92. if ext:
  93. filename += ext
  94. if not ext and link.url != resp.url:
  95. ext = os.path.splitext(resp.url)[1]
  96. if ext:
  97. filename += ext
  98. return filename
  99. def _http_get_download(session, link):
  100. # type: (PipSession, Link) -> Response
  101. target_url = link.url.split('#', 1)[0]
  102. resp = session.get(target_url, headers=HEADERS, stream=True)
  103. raise_for_status(resp)
  104. return resp
  105. class Downloader:
  106. def __init__(
  107. self,
  108. session, # type: PipSession
  109. progress_bar, # type: str
  110. ):
  111. # type: (...) -> None
  112. self._session = session
  113. self._progress_bar = progress_bar
  114. def __call__(self, link, location):
  115. # type: (Link, str) -> Tuple[str, str]
  116. """Download the file given by link into location."""
  117. try:
  118. resp = _http_get_download(self._session, link)
  119. except NetworkConnectionError as e:
  120. assert e.response is not None
  121. logger.critical(
  122. "HTTP error %s while getting %s", e.response.status_code, link
  123. )
  124. raise
  125. filename = _get_http_response_filename(resp, link)
  126. filepath = os.path.join(location, filename)
  127. chunks = _prepare_download(resp, link, self._progress_bar)
  128. with open(filepath, 'wb') as content_file:
  129. for chunk in chunks:
  130. content_file.write(chunk)
  131. content_type = resp.headers.get('Content-Type', '')
  132. return filepath, content_type
  133. class BatchDownloader:
  134. def __init__(
  135. self,
  136. session, # type: PipSession
  137. progress_bar, # type: str
  138. ):
  139. # type: (...) -> None
  140. self._session = session
  141. self._progress_bar = progress_bar
  142. def __call__(self, links, location):
  143. # type: (Iterable[Link], str) -> Iterable[Tuple[Link, Tuple[str, str]]]
  144. """Download the files given by links into location."""
  145. for link in links:
  146. try:
  147. resp = _http_get_download(self._session, link)
  148. except NetworkConnectionError as e:
  149. assert e.response is not None
  150. logger.critical(
  151. "HTTP error %s while getting %s",
  152. e.response.status_code, link,
  153. )
  154. raise
  155. filename = _get_http_response_filename(resp, link)
  156. filepath = os.path.join(location, filename)
  157. chunks = _prepare_download(resp, link, self._progress_bar)
  158. with open(filepath, 'wb') as content_file:
  159. for chunk in chunks:
  160. content_file.write(chunk)
  161. content_type = resp.headers.get('Content-Type', '')
  162. yield link, (filepath, content_type)