From d49c99f10d66e6c485bde7e35d79dea07f3c90eb Mon Sep 17 00:00:00 2001
From: "Miss Islington (bot)"
 <31488909+miss-islington@users.noreply.github.com>
Date: Tue, 5 Jul 2022 10:09:51 -0700
Subject: [PATCH] gh-94360: Fix a tokenizer crash when reading encoded files
 with syntax errors from stdin (GH-94386)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* gh-94360: Fix a tokenizer crash when reading encoded files with syntax errors from stdin

Signed-off-by: Pablo Galindo <pablogsal@gmail.com>

* nitty nit

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
(cherry picked from commit 36fcde61ba48c4e918830691ecf4092e4e3b9b99)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
---
 .../2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst      |  2 ++
 Parser/pegen_errors.c                                  |  8 ++++----
 Parser/tokenizer.c                                     | 10 +++++++++-
 3 files changed, 15 insertions(+), 5 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst

diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst b/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst
new file mode 100644
index 00000000000..0a74ba38b0a
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-06-28-14-20-36.gh-issue-94360.DiEnen.rst	
@@ -0,0 +1,2 @@
+Fixed a tokenizer crash when reading encoded files with syntax errors from
+``stdin`` with non utf-8 encoded text. Patch by Pablo Galindo
diff --git a/Parser/pegen_errors.c b/Parser/pegen_errors.c
index 48969967963..5703088443e 100644
--- a/Parser/pegen_errors.c
+++ b/Parser/pegen_errors.c
@@ -259,15 +259,15 @@ get_error_line_from_tokenizer_buffers(Parser *p, Py_ssize_t lineno)
     const char* buf_end = p->tok->fp_interactive ? p->tok->interactive_src_end : p->tok->inp;
 
     for (int i = 0; i < relative_lineno - 1; i++) {
-        char *new_line = strchr(cur_line, '\n') + 1;
+        char *new_line = strchr(cur_line, '\n');
         // The assert is here for debug builds but the conditional that
         // follows is there so in release builds we do not crash at the cost
         // to report a potentially wrong line.
-        assert(new_line != NULL && new_line <= buf_end);
-        if (new_line == NULL || new_line > buf_end) {
+        assert(new_line != NULL && new_line + 1 < buf_end);
+        if (new_line == NULL || new_line + 1 > buf_end) {
             break;
         }
-        cur_line = new_line;
+        cur_line = new_line + 1;
     }
 
     char *next_newline;
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index c450aa8e463..b61ac120c86 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -305,6 +305,10 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
 
     Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
     Py_ssize_t line_size = strlen(line);
+    char last_char = line[line_size > 0 ? line_size - 1 : line_size];
+    if (last_char != '\n') {
+        line_size += 1;
+    }
     char* new_str = tok->interactive_src_start;
 
     new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
@@ -318,7 +322,11 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
         return -1;
     }
     strcpy(new_str + current_size, line);
-
+    if (last_char != '\n') {
+        /* Last line does not end in \n, fake one */
+        new_str[current_size + line_size - 1] = '\n';
+        new_str[current_size + line_size] = '\0';
+    }
     tok->interactive_src_start = new_str;
     tok->interactive_src_end = new_str + current_size + line_size;
     return 0;
-- 
GitLab