123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584 |
- # -*- coding: utf-8 -*-
- """
- werkzeug.formparser
- ~~~~~~~~~~~~~~~~~~~
- This module implements the form parsing. It supports url-encoded forms
- as well as non-nested multipart uploads.
- :copyright: 2007 Pallets
- :license: BSD-3-Clause
- """
- import codecs
- import re
- from functools import update_wrapper
- from itertools import chain
- from itertools import repeat
- from itertools import tee
- from . import exceptions
- from ._compat import BytesIO
- from ._compat import text_type
- from ._compat import to_native
- from .datastructures import FileStorage
- from .datastructures import Headers
- from .datastructures import MultiDict
- from .http import parse_options_header
- from .urls import url_decode_stream
- from .wsgi import get_content_length
- from .wsgi import get_input_stream
- from .wsgi import make_line_iter
- # there are some platforms where SpooledTemporaryFile is not available.
- # In that case we need to provide a fallback.
- try:
- from tempfile import SpooledTemporaryFile
- except ImportError:
- from tempfile import TemporaryFile
- SpooledTemporaryFile = None
- #: an iterator that yields empty strings
- _empty_string_iter = repeat("")
- #: a regular expression for multipart boundaries
- _multipart_boundary_re = re.compile("^[ -~]{0,200}[!-~]$")
- #: supported http encodings that are also available in python we support
- #: for multipart messages.
- _supported_multipart_encodings = frozenset(["base64", "quoted-printable"])
def default_stream_factory(
    total_content_length, filename, content_type, content_length=None
):
    """Return the container an uploaded file is written to by default.

    When ``SpooledTemporaryFile`` is available it is always used, with a
    spool limit of 500 KiB.  Otherwise the total request size decides:
    an unknown or larger-than-limit request gets a real temporary file,
    anything else an in-memory buffer.

    :param total_content_length: content length of the whole request, or
                                 `None` if unknown.
    :param filename: name of the uploaded file (not used here).
    :param content_type: content type of the uploaded file (not used here).
    :param content_length: content length of the part (not used here).
    :return: a writable binary file-like object.
    """
    spool_limit = 1024 * 500

    if SpooledTemporaryFile is None:
        # Fallback for platforms without SpooledTemporaryFile.
        fits_in_memory = (
            total_content_length is not None
            and total_content_length <= spool_limit
        )
        return BytesIO() if fits_in_memory else TemporaryFile("wb+")

    return SpooledTemporaryFile(max_size=spool_limit, mode="wb+")
def parse_form_data(
    environ,
    stream_factory=None,
    charset="utf-8",
    errors="replace",
    max_form_memory_size=None,
    max_content_length=None,
    cls=None,
    silent=True,
):
    """Parse the form data found in a WSGI environ.

    This is a convenience wrapper that builds a :class:`FormDataParser`
    from the given options and calls its ``parse_from_environ`` method.
    It should only be used when the request method is `POST`, `PUT`, or
    `PATCH`.

    For `multipart/form-data` requests the files multidict is filled with
    `FileStorage` objects.  If the mimetype is unknown the input stream is
    wrapped and returned as first item, otherwise the returned stream is
    empty.

    Have a look at :ref:`dealing-with-request-data` for more details.

    .. versionadded:: 0.5
       The `max_form_memory_size`, `max_content_length` and
       `cls` parameters were added.

    .. versionadded:: 0.5.1
       The optional `silent` flag was added.

    :param environ: the WSGI environment to be used for parsing.
    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor.  This callable works
                           the same as :meth:`~BaseResponse._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted
                                 for in-memory stored form data.  If the
                                 data exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use.  If this is not specified
                or `None` the default :class:`MultiDict` is used.
    :param silent: If set to False parsing errors will not be caught.
    :return: A tuple in the form ``(stream, form, files)``.
    """
    parser = FormDataParser(
        stream_factory,
        charset,
        errors,
        max_form_memory_size,
        max_content_length,
        cls,
        silent,
    )
    return parser.parse_from_environ(environ)
def exhaust_stream(f):
    """Decorate a method so its stream argument is drained on exit.

    The wrapped method is called as ``f(self, stream, *args, **kwargs)``.
    Whether it returns or raises, the stream is consumed afterwards: via
    its ``exhaust()`` method when it has one, otherwise by reading 64 KiB
    chunks until a read returns nothing.
    """

    def wrapper(self, stream, *args, **kwargs):
        try:
            return f(self, stream, *args, **kwargs)
        finally:
            drain = getattr(stream, "exhaust", None)
            if drain is None:
                # No dedicated helper on the stream: read until it is empty.
                while stream.read(1024 * 64):
                    pass
            else:
                drain()

    return update_wrapper(wrapper, f)
class FormDataParser(object):
    """Parser for form data submitted with a request.

    Out of the box it handles multipart and url encoded form data.  It can
    be subclassed and extended, but for most other mimetypes it is a better
    idea to use the untouched stream and expose it as separate attributes
    on a request object.

    .. versionadded:: 0.8

    :param stream_factory: An optional callable that returns a new read and
                           writeable file descriptor.  This callable works
                           the same as :meth:`~BaseResponse._get_file_stream`.
    :param charset: The character set for URL and url encoded form data.
    :param errors: The encoding error behavior.
    :param max_form_memory_size: the maximum number of bytes to be accepted
                                 for in-memory stored form data.  If the
                                 data exceeds the value specified an
                                 :exc:`~exceptions.RequestEntityTooLarge`
                                 exception is raised.
    :param max_content_length: If this is provided and the transmitted data
                               is longer than this value an
                               :exc:`~exceptions.RequestEntityTooLarge`
                               exception is raised.
    :param cls: an optional dict class to use.  If this is not specified
                or `None` the default :class:`MultiDict` is used.
    :param silent: If set to False parsing errors will not be caught.
    """

    def __init__(
        self,
        stream_factory=None,
        charset="utf-8",
        errors="replace",
        max_form_memory_size=None,
        max_content_length=None,
        cls=None,
        silent=True,
    ):
        self.stream_factory = (
            default_stream_factory if stream_factory is None else stream_factory
        )
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size
        self.max_content_length = max_content_length
        self.cls = MultiDict if cls is None else cls
        self.silent = silent

    def get_parse_func(self, mimetype, options):
        """Look up the parse function registered for `mimetype`.

        :param mimetype: the mimetype of the data.
        :param options: the mimetype parameters (not used by this lookup).
        :return: the entry of :attr:`parse_functions` for the mimetype, or
                 `None` if there is none.
        """
        return self.parse_functions.get(mimetype)

    def parse_from_environ(self, environ):
        """Parses the information from the environment as form data.

        :param environ: the WSGI environment to be used for parsing.
        :return: A tuple in the form ``(stream, form, files)``.
        """
        raw_content_type = environ.get("CONTENT_TYPE", "")
        length = get_content_length(environ)
        mimetype, options = parse_options_header(raw_content_type)
        body = get_input_stream(environ)
        return self.parse(body, mimetype, length, options)

    def parse(self, stream, mimetype, content_length, options=None):
        """Parses the information from the given stream, mimetype,
        content length and mimetype parameters.

        If no parse function is registered for the mimetype, or parsing
        fails with a `ValueError` while `silent` is enabled, the stream
        is returned together with two empty containers.

        :param stream: an input stream
        :param mimetype: the mimetype of the data
        :param content_length: the content length of the incoming data
        :param options: optional mimetype parameters (used for
                        the multipart boundary for instance)
        :return: A tuple in the form ``(stream, form, files)``.
        :raises ~exceptions.RequestEntityTooLarge: if `content_length` is
            known and exceeds `max_content_length`.
        """
        limit = self.max_content_length
        too_large = (
            limit is not None
            and content_length is not None
            and content_length > limit
        )
        if too_large:
            raise exceptions.RequestEntityTooLarge()

        if options is None:
            options = {}

        handler = self.get_parse_func(mimetype, options)
        if handler is None:
            return stream, self.cls(), self.cls()

        try:
            # The registered handlers are plain functions, so the parser
            # instance is passed explicitly.
            return handler(self, stream, mimetype, content_length, options)
        except ValueError:
            if not self.silent:
                raise

        return stream, self.cls(), self.cls()

    @exhaust_stream
    def _parse_multipart(self, stream, mimetype, content_length, options):
        """Parse a ``multipart/form-data`` body.

        :raises ValueError: if the options carry no boundary.
        :return: A tuple in the form ``(stream, form, files)``.
        """
        multipart = MultiPartParser(
            self.stream_factory,
            self.charset,
            self.errors,
            max_form_memory_size=self.max_form_memory_size,
            cls=self.cls,
        )

        raw_boundary = options.get("boundary")
        if raw_boundary is None:
            raise ValueError("Missing boundary")

        # The multipart parser works on bytes.
        if isinstance(raw_boundary, text_type):
            boundary = raw_boundary.encode("ascii")
        else:
            boundary = raw_boundary

        fields, uploads = multipart.parse(stream, boundary, content_length)
        return stream, fields, uploads

    @exhaust_stream
    def _parse_urlencoded(self, stream, mimetype, content_length, options):
        """Parse a url encoded body.

        :raises ~exceptions.RequestEntityTooLarge: if `content_length` is
            known and exceeds `max_form_memory_size`.
        :return: A tuple in the form ``(stream, form, files)`` where
                 `files` is always empty.
        """
        limit = self.max_form_memory_size
        too_large = (
            limit is not None
            and content_length is not None
            and content_length > limit
        )
        if too_large:
            raise exceptions.RequestEntityTooLarge()

        fields = url_decode_stream(
            stream, self.charset, errors=self.errors, cls=self.cls
        )
        return stream, fields, self.cls()

    #: mapping of mimetypes to parsing functions
    parse_functions = {
        "multipart/form-data": _parse_multipart,
        "application/x-www-form-urlencoded": _parse_urlencoded,
        "application/x-url-encoded": _parse_urlencoded,
    }
def is_valid_multipart_boundary(boundary):
    """Checks if the string given is a valid multipart boundary.

    Both text and byte strings are accepted.

    :param boundary: the boundary to check.
    :return: `True` if the boundary matches the boundary pattern,
             `False` otherwise.
    """
    if isinstance(boundary, bytes):
        # The boundary pattern is a text pattern and matching it against
        # bytes raises a TypeError on Python 3.  latin-1 maps every byte
        # to the code point of the same value, so bytes outside printable
        # ASCII decode cleanly and then simply fail to match.
        boundary = boundary.decode("latin-1")
    return _multipart_boundary_re.match(boundary) is not None
- def _line_parse(line):
- """Removes line ending characters and returns a tuple (`stripped_line`,
- `is_terminated`).
- """
- if line[-2:] in ["\r\n", b"\r\n"]:
- return line[:-2], True
- elif line[-1:] in ["\r", "\n", b"\r", b"\n"]:
- return line[:-1], True
- return line, False
def parse_multipart_headers(iterable):
    """Read the headers of one multipart part from an iterable of lines.

    Lines are consumed up to and including the empty line that ends the
    header section, so the iterable can afterwards be used to read the
    part's body.  A line that starts with a space or tab continues the
    previous header.  Lines without a colon are ignored.

    :param iterable: iterable of strings that are newline terminated
    :return: a :class:`Headers` object with the parsed headers.
    :raises ValueError: if a line is not newline terminated.
    """
    collected = []

    for raw in iterable:
        text, terminated = _line_parse(to_native(raw))
        if not terminated:
            raise ValueError("unexpected end of line in multipart header")

        if not text:
            # Blank line: the header section is over.
            break

        if collected and text[0] in " \t":
            # Folded header: append to the value read last.
            name, current = collected[-1]
            collected[-1] = (name, current + "\n " + text[1:])
            continue

        name, separator, value = text.partition(":")
        if separator:
            collected.append((name.strip(), value.strip()))

    # The list is handed over as is; nothing else holds a reference to it.
    return Headers(collected)
- _begin_form = "begin_form"
- _begin_file = "begin_file"
- _cont = "cont"
- _end = "end"
class MultiPartParser(object):
    """Line based parser for ``multipart/form-data`` bodies.

    :param stream_factory: callable that returns the container an uploaded
                           file is written to.  Defaults to
                           :func:`default_stream_factory`.
    :param charset: charset used to decode file names and form fields that
                    do not declare a charset of their own.
    :param errors: the decoding error behavior.
    :param max_form_memory_size: maximum number of bytes of form field
                                 data held in memory, or `None` for no
                                 limit.
    :param cls: dict class for the results.  Defaults to
                :class:`MultiDict`.
    :param buffer_size: buffer size passed to the line iterator.  Has to
                        be divisible by four and at least 1024.
    """

    def __init__(
        self,
        stream_factory=None,
        charset="utf-8",
        errors="replace",
        max_form_memory_size=None,
        cls=None,
        buffer_size=64 * 1024,
    ):
        self.charset = charset
        self.errors = errors
        self.max_form_memory_size = max_form_memory_size
        self.stream_factory = (
            default_stream_factory if stream_factory is None else stream_factory
        )
        self.cls = MultiDict if cls is None else cls

        # make sure the buffer size is divisible by four so that we can base64
        # decode chunk by chunk
        assert buffer_size % 4 == 0, "buffer size has to be divisible by 4"
        # also the buffer size has to be at least 1024 bytes long or long headers
        # will freak out the system
        assert buffer_size >= 1024, "buffer size has to be at least 1KB"

        self.buffer_size = buffer_size

    def _fix_ie_filename(self, filename):
        """Internet Explorer 6 transmits the full file name if a file is
        uploaded.  This function strips the full path if it thinks the
        filename is Windows-like absolute.
        """
        if filename[1:3] == ":\\" or filename[:2] == "\\\\":
            return filename.split("\\")[-1]
        return filename

    def _find_terminator(self, iterator):
        """Return the first non-blank line of `iterator`, stripped.

        The terminator might have some additional newlines before it.
        There is at least one application that sends additional newlines
        before headers (the python setuptools package).

        :return: the stripped line, or ``b""`` if the input ends first.
        """
        for line in iterator:
            if not line:
                # An empty item marks the end of the input.
                break
            line = line.strip()
            if line:
                return line
        return b""

    def fail(self, message):
        """Abort parsing by raising a `ValueError` with `message`."""
        raise ValueError(message)

    def get_part_encoding(self, headers):
        """Return the supported transfer encoding declared by a part.

        The value is compared case-insensitively because RFC 2045 defines
        Content-Transfer-Encoding values as not case sensitive.

        :param headers: the headers of the part.
        :return: the lowercased encoding name if it is one of the
                 supported encodings, otherwise `None`.
        """
        transfer_encoding = headers.get("content-transfer-encoding")
        if transfer_encoding is None:
            return None
        transfer_encoding = transfer_encoding.lower()
        if transfer_encoding in _supported_multipart_encodings:
            return transfer_encoding
        return None

    def get_part_charset(self, headers):
        """Return the charset to decode a form field with.

        :param headers: the headers of the part.
        :return: the ``charset`` parameter of the part's content type if
                 present, otherwise the parser's default charset.
        """
        content_type = headers.get("content-type")
        if content_type:
            mimetype, ct_params = parse_options_header(content_type)
            return ct_params.get("charset", self.charset)
        return self.charset

    def start_file_streaming(self, filename, headers, total_content_length):
        """Prepare the container an uploaded file is written to.

        :param filename: the file name as transmitted, text or bytes.
        :param headers: the headers of the part.
        :param total_content_length: content length of the whole request.
        :return: a tuple ``(filename, container)`` with the cleaned up file
                 name and the object returned by the stream factory.
        """
        if isinstance(filename, bytes):
            filename = filename.decode(self.charset, self.errors)
        filename = self._fix_ie_filename(filename)
        content_type = headers.get("content-type")
        try:
            content_length = int(headers["content-length"])
        except (KeyError, ValueError):
            # Header missing or not a number.
            content_length = 0
        container = self.stream_factory(
            total_content_length=total_content_length,
            filename=filename,
            content_type=content_type,
            content_length=content_length,
        )
        return filename, container

    def in_memory_threshold_reached(self, bytes):
        """Called when the in-memory form data exceeds the allowed size.

        :param bytes: the number of bytes held in memory.
        :raises ~exceptions.RequestEntityTooLarge: always.
        """
        raise exceptions.RequestEntityTooLarge()

    def validate_boundary(self, boundary):
        """Fail with a `ValueError` if `boundary` is not usable.

        :param boundary: the boundary, text or bytes.
        """
        if not boundary:
            self.fail("Missing boundary")
        # The boundary arrives as bytes from the form data parser while the
        # validation pattern is a text pattern, which Python 3 refuses to
        # mix.  latin-1 decoding never fails and keeps every byte value.
        if isinstance(boundary, bytes):
            boundary_text = boundary.decode("latin-1")
        else:
            boundary_text = boundary
        if not is_valid_multipart_boundary(boundary_text):
            self.fail("Invalid boundary: %s" % boundary_text)
        if len(boundary) > self.buffer_size:  # pragma: no cover
            # this should never happen because we check for a minimum size
            # of 1024 and boundaries may not be longer than 200.  The only
            # situation when this happens is for non debug builds where
            # the assert is skipped.
            self.fail("Boundary longer than buffer size")

    def parse_lines(self, file, boundary, content_length, cap_at_buffer=True):
        """Generate parts of

        ``('begin_form', (headers, name))``
        ``('begin_file', (headers, name, filename))``
        ``('cont', bytestring)``
        ``('end', None)``

        Always obeys the grammar

        parts = ( begin_form cont* end |
                  begin_file cont* end )*

        :param file: the input stream.
        :param boundary: the multipart boundary as bytes.
        :param content_length: limit passed to the line iterator.
        :param cap_at_buffer: passed through to the line iterator.
        :raises ValueError: on malformed input.
        """
        next_part = b"--" + boundary
        last_part = next_part + b"--"

        # The endless tail of empty strings turns "input exhausted" into a
        # falsy line that the loops below can test for.
        iterator = chain(
            make_line_iter(
                file,
                limit=content_length,
                buffer_size=self.buffer_size,
                cap_at_buffer=cap_at_buffer,
            ),
            _empty_string_iter,
        )

        terminator = self._find_terminator(iterator)

        if terminator == last_part:
            # Closing boundary right away: there are no parts.
            return
        elif terminator != next_part:
            self.fail("Expected boundary at start of multipart data")

        while terminator != last_part:
            headers = parse_multipart_headers(iterator)

            disposition = headers.get("content-disposition")
            if disposition is None:
                self.fail("Missing Content-Disposition header")
            disposition, extra = parse_options_header(disposition)
            transfer_encoding = self.get_part_encoding(headers)
            name = extra.get("name")
            filename = extra.get("filename")

            # a part without a filename is a regular form field
            if filename is None:
                yield _begin_form, (headers, name)

            # otherwise it is announced as a file upload
            else:
                yield _begin_file, (headers, name, filename)

            buf = b""
            for line in iterator:
                if not line:
                    self.fail("unexpected end of stream")

                if line[:2] == b"--":
                    terminator = line.rstrip()
                    if terminator in (next_part, last_part):
                        break

                if transfer_encoding is not None:
                    if transfer_encoding == "base64":
                        transfer_encoding = "base64_codec"
                    try:
                        line = codecs.decode(line, transfer_encoding)
                    except Exception:
                        self.fail("could not decode transfer encoded chunk")

                # we have something in the buffer from the last iteration.
                # this is usually a newline delimiter.
                if buf:
                    yield _cont, buf
                    buf = b""

                # If the line ends with windows CRLF we write everything except
                # the last two bytes.  In all other cases however we write
                # everything except the last byte.  If it was a newline, that's
                # fine, otherwise it does not matter because we will write it
                # the next iteration.  this ensures we do not write the
                # final newline into the stream.  That way we do not have to
                # truncate the stream.  However we do have to make sure that
                # if something else than a newline is in there we write it
                # out.
                if line[-2:] == b"\r\n":
                    buf = b"\r\n"
                    cutoff = -2
                else:
                    buf = line[-1:]
                    cutoff = -1
                yield _cont, line[:cutoff]

            else:  # pragma: no cover
                raise ValueError("unexpected end of part")

            # if we have a leftover in the buffer that is not a newline
            # character we have to flush it, otherwise we will chop off
            # certain values.
            if buf not in (b"", b"\r", b"\n", b"\r\n"):
                yield _cont, buf

            yield _end, None

    def parse_parts(self, file, boundary, content_length):
        """Generate ``('file', (name, val))`` and
        ``('form', (name, val))`` parts.

        File values are `FileStorage` objects, form values are decoded
        strings.  The memory limit is checked against the running total of
        all form field data.
        """
        in_memory = 0

        for ellt, ell in self.parse_lines(file, boundary, content_length):
            if ellt == _begin_file:
                headers, name, filename = ell
                is_file = True
                guard_memory = False
                filename, container = self.start_file_streaming(
                    filename, headers, content_length
                )
                _write = container.write

            elif ellt == _begin_form:
                headers, name = ell
                is_file = False
                # form field chunks are collected in a list and joined at
                # the end of the part
                container = []
                _write = container.append
                guard_memory = self.max_form_memory_size is not None

            elif ellt == _cont:
                _write(ell)
                # if we write into memory and there is a memory size limit we
                # count the number of bytes in memory and raise an exception if
                # there is too much data in memory.
                if guard_memory:
                    in_memory += len(ell)
                    if in_memory > self.max_form_memory_size:
                        self.in_memory_threshold_reached(in_memory)

            elif ellt == _end:
                if is_file:
                    container.seek(0)
                    yield (
                        "file",
                        (name, FileStorage(container, filename, name, headers=headers)),
                    )
                else:
                    part_charset = self.get_part_charset(headers)
                    yield (
                        "form",
                        (name, b"".join(container).decode(part_charset, self.errors)),
                    )

    def parse(self, file, boundary, content_length):
        """Parse the whole body.

        :param file: the input stream.
        :param boundary: the multipart boundary as bytes.
        :param content_length: the content length of the incoming data.
        :return: a tuple ``(form, files)`` of two `cls` instances.
        """
        # Both result containers are fed from the same pass over the parts.
        formstream, filestream = tee(
            self.parse_parts(file, boundary, content_length), 2
        )
        form = (p[1] for p in formstream if p[0] == "form")
        files = (p[1] for p in filestream if p[0] == "file")
        return self.cls(form), self.cls(files)
|