Skip to content

Instantly share code, notes, and snippets.

@gboudreau
Created September 30, 2017 00:46
Show Gist options
  • Save gboudreau/2f74a58adea787a2d0efffd2bd6cd26d to your computer and use it in GitHub Desktop.
Save gboudreau/2f74a58adea787a2d0efffd2bd6cd26d to your computer and use it in GitHub Desktop.
# HG changeset patch
# Parent 60085c8f01fe4eb19a1c38a2d27fd77698b5a5ec
Issue #24363: Add policy flag to avoid parsing HTTP header as email body
diff -r 60085c8f01fe Lib/email/errors.py
--- a/Lib/email/errors.py Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/errors.py Mon Jan 23 23:39:53 2017 +0000
@@ -55,8 +55,9 @@
class MissingHeaderBodySeparatorDefect(MessageDefect):
"""Found line with no leading whitespace and no colon before blank line."""
-# XXX: backward compatibility, just in case (it was never emitted).
-MalformedHeaderDefect = MissingHeaderBodySeparatorDefect
+
+class MalformedHeaderDefect(MessageDefect):
+ """An ordinary header line did not match the expected format."""
class MultipartInvariantViolationDefect(MessageDefect):
"""A message claimed to be a multipart but no subparts were found."""
diff -r 60085c8f01fe Lib/email/feedparser.py
--- a/Lib/email/feedparser.py Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/email/feedparser.py Mon Jan 23 23:39:53 2017 +0000
@@ -169,6 +169,9 @@
self._last = None
self._headersonly = False
+ # True to parse an HTTP header section that is detached from any body
+ self.__body_detached = getattr(policy, "_py_body_detached", False)
+
# Non-public interface for supporting Parser's headersonly flag
def _set_headersonly(self):
self._headersonly = True
@@ -233,9 +236,18 @@
# (i.e. newline), just throw it away. Otherwise the line is
# part of the body so push it back.
if not NLCRE.match(line):
- defect = errors.MissingHeaderBodySeparatorDefect()
+ if self.__body_detached:
+ defect = "Invalid header line: " + repr(line)
+ defect = errors.MalformedHeaderDefect(defect)
+ else:
+ defect = errors.MissingHeaderBodySeparatorDefect()
+ self._input.unreadline(line)
self.policy.handle_defect(self._cur, defect)
- self._input.unreadline(line)
+ if self.__body_detached:
+ # Even a seemingly blank line could be a lone "\r", which the
+ # HTTP parser does not treat as the end of the header section,
+ # so skip it in case other header fields follow
+ continue
break
headers.append(line)
# Done with the headers, so parse them and figure out what we're
@@ -322,7 +334,7 @@
lines.append(line)
self._cur.set_payload(EMPTYSTRING.join(lines))
return
- # Make sure a valid content type was specified per RFC 2045:6.4.
+ # Make sure a valid encoding was specified per RFC 2045:6.4.
if (self._cur.get('content-transfer-encoding', '8bit').lower()
not in ('7bit', '8bit', 'binary')):
defect = errors.InvalidMultipartContentTransferEncodingDefect()
@@ -499,7 +511,7 @@
line = line[:-len(mo.group(0))]
self._cur.set_unixfrom(line)
continue
- elif lineno == len(lines) - 1:
+ elif not self.__body_detached and lineno == len(lines) - 1:
# Something looking like a unix-from at the end - it's
# probably the first line of the body, so push back the
# line and stop.
@@ -511,10 +523,13 @@
defect = errors.MisplacedEnvelopeHeaderDefect(line)
self._cur.defects.append(defect)
continue
+
# Split the line on the colon separating field name from value.
- # There will always be a colon, because if there wasn't the part of
- # the parser that calls us would have started parsing the body.
- i = line.find(':')
+ # There will always be a colon: a line without one would already
+ # have been caught, either by the part of the parser that calls
+ # us or by the continuation and envelope checks
+ # above.
+ i = line.index(':')
# If the colon is on the start of the line the header is clearly
# malformed, but we might be able to salvage the rest of the
@@ -524,7 +539,6 @@
self._cur.defects.append(defect)
continue
- assert i>0, "_parse_headers fed line with no : and no leading WS"
lastheader = line[:i]
lastvalue = [line]
# Done with all the lines, so handle the last header.
diff -r 60085c8f01fe Lib/http/client.py
--- a/Lib/http/client.py Thu Sep 08 22:37:34 2016 -0400
+++ b/Lib/http/client.py Mon Jan 23 23:39:53 2017 +0000
@@ -69,6 +69,7 @@
"""
import email.parser
+import email.policy
import email.message
import http
import io
@@ -191,14 +192,16 @@
lst.append(line)
return lst
+class _Policy(email.policy.Compat32):
+ _py_body_detached = True
+
def parse_headers(fp, _class=HTTPMessage):
"""Parses only RFC2822 headers from a file pointer.
- email Parser wants to see strings rather than bytes.
- But a TextIOWrapper around self.rfile would buffer too many bytes
- from the stream, bytes which we later need to read as bytes.
- So we read the correct bytes here, as bytes, for email Parser
- to parse.
+ The parser works with text strings rather than bytes.
+ But a TextIOWrapper may internally buffer too many bytes from the stream,
+ bytes which we later need to read. So we read the correct number of
+ bytes from the stream before decoding them to text to be parsed.
"""
headers = []
@@ -212,7 +215,8 @@
if line in (b'\r\n', b'\n', b''):
break
hstring = b''.join(headers).decode('iso-8859-1')
- return email.parser.Parser(_class=_class).parsestr(hstring)
+ parser = email.parser.Parser(_class=_class, policy=_Policy())
+ return parser.parsestr(hstring)
class HTTPResponse(io.BufferedIOBase):
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment