From 05692c67c51b78a5a5a7bb61d646519025e38015 Mon Sep 17 00:00:00 2001
From: Michael Droettboom <mdboom@gmail.com>
Date: Tue, 6 Sep 2022 19:12:16 -0400
Subject: [PATCH] gh-96611: Fix error message for invalid UTF-8 in
 mid-multiline string (#96623)

---
 Lib/test/test_source_encoding.py                     | 12 ++++++++++++
 .../2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst    |  2 ++
 Parser/tokenizer.c                                   |  2 ++
 3 files changed, 16 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst

diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index 8e68b4eae33..feaff4770f7 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -147,6 +147,18 @@ def test_error_from_string(self):
         self.assertTrue(c.exception.args[0].startswith(expected),
                         msg=c.exception.args[0])
 
+    def test_file_parse_error_multiline(self):
+        # gh-96611: invalid UTF-8 inside a multi-line string literal
+        with open(TESTFN, "wb") as fd:
+            fd.write(b'print("""\n\xb1""")\n')
+
+        try:
+            retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN)
+
+            self.assertGreater(retcode, 0)
+            self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr)
+        finally:
+            os.unlink(TESTFN)
 
 class AbstractSourceEncodingTest:
 
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst
new file mode 100644
index 00000000000..08bd409bc9f
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-09-06-16-22-13.gh-issue-96611.14wIX8.rst	
@@ -0,0 +1,2 @@
+When loading a file with invalid UTF-8 inside a multi-line string, the
+SyntaxError that is raised now correctly reports the non-UTF-8 byte.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index f2606f17d14..6d08db5ebd5 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1936,6 +1936,8 @@ tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
         /* Get rest of string */
         while (end_quote_size != quote_size) {
             c = tok_nextc(tok);
+            if (tok->done == E_DECODE)
+                break;
             if (c == EOF || (quote_size == 1 && c == '\n')) {
                 assert(tok->multi_line_start != NULL);
                 // shift the tok_state's location into
-- 
GitLab