[Rust,front-end,v2,09/37] gccrs: Add Lexer for Rust front-end
Commit Message
From: The Other <simplytheother@gmail.com>
The lexer is referred to as a ManagedTokenSource within the parser. This
lexer does not currently support Unicode, but it serves as a starting point
for doing so.
Co-authored-by: Philip Herron <philip.herron@embecosm.com>
Co-authored-by: Arthur Cohen <arthur.cohen@embecosm.com>
Co-authored-by: Mark Wielaard <mark@klomp.org>
---
gcc/rust/lex/rust-codepoint.h | 46 +
gcc/rust/lex/rust-lex.cc | 2729 ++++++++++++++++++++++++++++++++
gcc/rust/lex/rust-lex.h | 271 ++++
gcc/rust/lex/rust-token.cc | 135 ++
gcc/rust/lex/rust-token.h | 455 ++++++
gcc/rust/rust-buffered-queue.h | 204 +++
6 files changed, 3840 insertions(+)
create mode 100644 gcc/rust/lex/rust-codepoint.h
create mode 100644 gcc/rust/lex/rust-lex.cc
create mode 100644 gcc/rust/lex/rust-lex.h
create mode 100644 gcc/rust/lex/rust-token.cc
create mode 100644 gcc/rust/lex/rust-token.h
create mode 100644 gcc/rust/rust-buffered-queue.h
Comments
On Wed, Aug 24, 2022 at 2:04 PM <herron.philip@googlemail.com> wrote:
>
> From: The Other <simplytheother@gmail.com>
>
> The lexer is referred to as a ManagedTokenSource within the parser. This
> lexer does not currently support Unicode, but it serves as a starting point
> for doing so.
>
> Co-authored-by: Philip Herron <philip.herron@embecosm.com>
> Co-authored-by: Arthur Cohen <arthur.cohen@embecosm.com>
> Co-authored-by: Mark Wielaard <mark@klomp.org>
> ---
> gcc/rust/lex/rust-codepoint.h | 46 +
> gcc/rust/lex/rust-lex.cc | 2729 ++++++++++++++++++++++++++++++++
> gcc/rust/lex/rust-lex.h | 271 ++++
> gcc/rust/lex/rust-token.cc | 135 ++
> gcc/rust/lex/rust-token.h | 455 ++++++
> gcc/rust/rust-buffered-queue.h | 204 +++
> 6 files changed, 3840 insertions(+)
> create mode 100644 gcc/rust/lex/rust-codepoint.h
> create mode 100644 gcc/rust/lex/rust-lex.cc
> create mode 100644 gcc/rust/lex/rust-lex.h
> create mode 100644 gcc/rust/lex/rust-token.cc
> create mode 100644 gcc/rust/lex/rust-token.h
> create mode 100644 gcc/rust/rust-buffered-queue.h
>
> diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h
> new file mode 100644
> index 00000000000..22da080bbb2
> --- /dev/null
> +++ b/gcc/rust/lex/rust-codepoint.h
> @@ -0,0 +1,46 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#ifndef RUST_CODEPOINT_H
> +#define RUST_CODEPOINT_H
> +
> +#include <string>
> +
> +namespace Rust {
> +// A 32-bit character value as handled by the lexer.  UINT32_MAX is reserved
> +// as the end-of-file sentinel.
> +struct Codepoint
> +{
> +  uint32_t value;
> +
> +  // The default codepoint is zero.
> +  Codepoint () : Codepoint (0) {}
> +
> +  // Wraps VALUE as-is; no validation is performed.
> +  Codepoint (uint32_t value) : value (value) {}
> +
> +  // The end-of-file sentinel and the matching predicate.
> +  static Codepoint eof () { return Codepoint (UINT32_MAX); }
> +  bool is_eof () const { return *this == eof (); }
> +
> +  // Returns the codepoint as a C++ string.
> +  std::string as_string ();
> +
> +  // Codepoints compare by their stored value.
> +  bool operator== (Codepoint other) const { return value == other.value; }
> +  bool operator!= (Codepoint other) const { return value != other.value; }
> +};
> +} // namespace Rust
> +
> +#endif
> diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
> new file mode 100644
> index 00000000000..70e6b50209f
> --- /dev/null
> +++ b/gcc/rust/lex/rust-lex.cc
> @@ -0,0 +1,2729 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#include "rust-lex.h"
> +
> +#include "rust-system.h" // for rust_assert and rust_unreachable
> +#include "rust-diagnostics.h" // for rust_error_at
> +#include "rust-linemap.h"
> +#include "rust-session-manager.h"
> +#include "safe-ctype.h"
Just diving into a random patch here - I'm assuming I can take rust-lex.cc as
a boilerplate example for the #include structure.
In GCC, all files should start by #including "config.h" followed by
"system.h"; _all_ system headers, including C++ standard library headers,
should be pulled in via system.h to allow working around OS and system
compiler issues.
It might be that rust-system.h plays the role of config.h + system.h,
but then the rust-lex.h include comes before it.
rust-codepoint.h including <string> is also problematic, btw.
Richard.
> +namespace Rust {
> +// TODO: move to separate compilation unit?
> +/* Appends the UTF-8 encoding of CHAR32 to STR and returns STR.  One to four
> + * bytes are emitted depending on the magnitude of the value; a value of
> + * 0x200000 or above is not encoded at all and is only reported through
> + * rust_debug.  */
> +std::string &
> +operator+= (std::string &str, Codepoint char32)
> +{
> +  const uint32_t cp = char32.value;
> +
> +  // Appends a continuation byte (10xxxxxx) holding the six bits of CP that
> +  // start at bit SHIFT.
> +  auto append_continuation = [&str, cp] (int shift) {
> +    str += static_cast<char> (0x80 | ((cp >> shift) & 0x3F));
> +  };
> +
> +  if (cp < 0x80)
> +    {
> +      // One byte: 0xxxxxxx.
> +      str += static_cast<char> (cp);
> +    }
> +  else if (cp < 0x800)
> +    {
> +      // Two bytes: 110xxxxx 10xxxxxx.
> +      str += static_cast<char> (0xC0 | ((cp >> 6) & 0x1F));
> +      append_continuation (0);
> +    }
> +  else if (cp < 0x10000)
> +    {
> +      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
> +      str += static_cast<char> (0xE0 | ((cp >> 12) & 0x0F));
> +      append_continuation (6);
> +      append_continuation (0);
> +    }
> +  else if (cp < 0x200000)
> +    {
> +      // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
> +      str += static_cast<char> (0xF0 | ((cp >> 18) & 0x07));
> +      append_continuation (12);
> +      append_continuation (6);
> +      append_continuation (0);
> +    }
> +  else
> +    {
> +      rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
> +    }
> +
> +  return str;
> +}
> +
> +// Returns a fresh string holding only this codepoint, encoded by the
> +// operator+= overload for Codepoint.
> +std::string
> +Codepoint::as_string ()
> +{
> +  std::string encoded;
> +  operator+= (encoded, *this);
> +  return encoded;
> +}
> +
> +/* Tells whether NUMBER may appear inside a float literal: a decimal digit or
> + * an exponent marker ('e' / 'E').  '_' and '.' are deliberately excluded
> + * because handling them needs lookahead.  */
> +bool
> +is_float_digit (char number)
> +{
> +  switch (number)
> +    {
> +    case 'e':
> +    case 'E':
> +      return true;
> +    default:
> +      return ISDIGIT (number);
> +    }
> +}
> +
> +/* Tells whether NUMBER is a hexadecimal digit.  Currently just ISXDIGIT from
> + * safe-ctype, kept as a separate function in case Rust's rules turn out to
> + * differ.  */
> +bool
> +is_x_digit (char number)
> +{
> +  return ISXDIGIT (number) != 0;
> +}
> +
> +// Tells whether NUMBER is one of the octal digits '0' through '7'.
> +bool
> +is_octal_digit (char number)
> +{
> +  return '0' <= number && number < '8';
> +}
> +
> +// Tells whether NUMBER is a binary digit, i.e. '0' or '1'.
> +bool
> +is_bin_digit (char number)
> +{
> +  switch (number)
> +    {
> +    case '0':
> +    case '1':
> +      return true;
> +    default:
> +      return false;
> +    }
> +}
> +
> +/* Returns false when CHARACTER is '.', '_' or alphabetic, and true for
> + * anything else.  */
> +bool
> +check_valid_float_dot_end (char character)
> +{
> +  if (character == '.' || character == '_')
> +    return false;
> +
> +  return !ISALPHA (character);
> +}
> +
> +// Tells whether CHARACTER is whitespace.  Currently just ISSPACE from
> +// safe-ctype, but this may change in future.
> +bool
> +is_whitespace (char character)
> +{
> +  return ISSPACE (character) != 0;
> +}
> +
> +/* Tells whether CHARACTER is one of the radix markers 'x', 'o' or 'b'.
> + * build_token checks it on the character following a leading '0'.  */
> +bool
> +is_non_decimal_int_literal_separator (char character)
> +{
> +  switch (character)
> +    {
> +    case 'x':
> +    case 'o':
> +    case 'b':
> +      return true;
> +    default:
> +      return false;
> +    }
> +}
> +
> +/* Builds a lexer that reads from the in-memory string INPUT through a
> + * BufferInputSource.  No file backs this lexer (the file member is set to
> + * RAIIFile::create_error ()) and no line map is attached.  Line and column
> + * counters start at 1.
> + * NOTE(review): the 0 passed to BufferInputSource is presumably the starting
> + * offset into INPUT - confirm against its declaration.  */
> +Lexer::Lexer (const std::string &input)
> + : input (RAIIFile::create_error ()), current_line (1), current_column (1),
> + line_map (nullptr), raw_input_source (new BufferInputSource (input, 0)),
> + input_queue{*raw_input_source}, token_queue (TokenSource (this))
> +{}
> +
> +/* Builds a lexer that reads FILE_INPUT (moved into the lexer) through a
> + * FileInputSource.  LINEMAP may be null; when it is not, FILENAME is
> + * announced to it as the file being entered.  Line and column counters start
> + * at 1.  */
> +Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap)
> + : input (std::move (file_input)), current_line (1), current_column (1),
> + line_map (linemap),
> + raw_input_source (new FileInputSource (input.get_raw ())),
> + input_queue{*raw_input_source}, token_queue (TokenSource (this))
> +{
> + // inform line_table that file is being entered and is in line 1
> + if (linemap)
> + line_map->start_file (filename, current_line);
> +}
> +
> +Lexer::~Lexer ()
> +{
> + /* Intentionally does nothing for now.  Apparently stop (which is the
> + * equivalent of the original code in the destructor) is meant to be called
> + * after all files have finished parsing, for cleanup.  On the other hand,
> + * the code it calls to leave a line map is described in the GCC docs as
> + * being useful for "just leaving an included header" and the like, so this
> + * line mapping functionality may need fixing.
> + * FIXME: find out whether this occurs. */
> +
> + // line_map->stop();
> +}
> +
> +/* TODO: need to optimise somehow to avoid the virtual function call in the
> + * tight loop. Best idea at the moment is CRTP, but that might make lexer
> + * implementation annoying when storing the "base class" (i.e. would need
> + * template parameter everywhere), although in practice it would mostly just
> + * look ugly and make enclosing classes like Parser also require a type
> + * parameter. At this point a macro might be better. OK I guess macros can be
> + * replaced by constexpr if or something if possible. */
> +
> +/* Returns the location for the current column as reported by the line map,
> + * or a default-constructed Location when the lexer has no line map.  */
> +Location
> +Lexer::get_current_location ()
> +{
> +  // Without a linemap we are lexing something that has no proper locations.
> +  if (line_map == nullptr)
> +    return Location ();
> +
> +  return line_map->get_location (current_column);
> +}
> +
> +/* Returns input_queue.peek (N).  build_token uses 0 for the next unread
> + * character and larger values to look further ahead.  */
> +int
> +Lexer::peek_input (int n)
> +{
> + return input_queue.peek (n);
> +}
> +
> +// Convenience overload: same as peek_input (0).
> +int
> +Lexer::peek_input ()
> +{
> + return peek_input (0);
> +}
> +
> +/* Forwards to input_queue.skip (N).
> + * NOTE(review): the call sites in build_token (e.g. skip_input (1) after
> + * matching two characters) suggest N + 1 characters are consumed - confirm
> + * against the buffered queue implementation.  */
> +void
> +Lexer::skip_input (int n)
> +{
> + input_queue.skip (n);
> +}
> +
> +// Convenience overload: same as skip_input (0).
> +void
> +Lexer::skip_input ()
> +{
> + skip_input (0);
> +}
> +
> +/* Replaces the current value of the token queue with REPLACEMENT.  Deprecated:
> + * every call emits a debug message saying so.  */
> +void
> +Lexer::replace_current_token (TokenPtr replacement)
> +{
> + token_queue.replace_current_value (replacement);
> +
> + rust_debug ("called 'replace_current_token' - this is deprecated");
> +}
> +
> +/* File-local lookup tables for classify_keyword, generated from RS_TOKEN_LIST
> + * with x-macros.  keyword_index[i] is the spelling of the keyword whose token
> + * id is keyword_keys[i].  classify_keyword runs std::lower_bound over
> + * keyword_index, so the keyword entries must appear sorted by spelling.  */
> +namespace {
> +// TODO: make constexpr when update to c++20
> +const std::string keyword_index[] = {
> +#define RS_TOKEN(x, y)
> +#define RS_TOKEN_KEYWORD(name, keyword) keyword,
> + RS_TOKEN_LIST
> +#undef RS_TOKEN_KEYWORD
> +#undef RS_TOKEN
> +};
> +
> +constexpr TokenId keyword_keys[] = {
> +#define RS_TOKEN(x, y)
> +#define RS_TOKEN_KEYWORD(name, keyword) name,
> + RS_TOKEN_LIST
> +#undef RS_TOKEN_KEYWORD
> +#undef RS_TOKEN
> +};
> +
> +// Number of entries in each of the two tables above.
> +constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
> +} // namespace
> +
> +/* Classifies STR: returns the token id of the keyword it spells, or
> + * IDENTIFIER when it is not a keyword in the edition being compiled.  */
> +TokenId
> +Lexer::classify_keyword (const std::string &str)
> +{
> +  // Binary search in the sorted keyword spellings.
> +  const std::string *const table_end = keyword_index + num_keywords;
> +  const std::string *const match
> +    = std::lower_bound (keyword_index, table_end, str);
> +
> +  const bool spells_keyword = match != table_end && *match == str;
> +  if (!spells_keyword)
> +    return IDENTIFIER;
> +
> +  // TODO: possibly replace this x-macro system with something like hash map?
> +
> +  // The two tables are parallel, so the position of the spelling gives the
> +  // token id.  Some keywords are only reserved starting in certain editions
> +  // (and the language might gain new reserved keywords in the future), so
> +  // the id still has to be checked against the edition.
> +  //
> +  // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
> +  const TokenId id = keyword_keys[match - keyword_index];
> +
> +  // `try` is not a reserved keyword before 2018
> +  const bool is_edition_2015
> +    = Session::get_instance ().options.get_edition ()
> +      == CompileOptions::Edition::E2015;
> +  if (is_edition_2015 && id == TRY)
> +    return IDENTIFIER;
> +
> +  return id;
> +}
> +
> +/* Lexes and returns the next token from the input.
> + *
> + * Whitespace and ordinary (non-doc) comments are skipped while the line and
> + * column counters are kept up to date. A leading UTF-8 BOM and a leading
> + * shebang line are skipped as well. Doc comments are returned as inner/outer
> + * doc comment tokens. At end of input an END_OF_FILE token is returned. A
> + * character that starts no known token is reported with rust_error_at and
> + * skipped, and lexing carries on with the next character. */
> +TokenPtr
> +Lexer::build_token ()
> +{
> + // loop to go through multiple characters to build a single token
> + while (true)
> + {
> + Location loc = get_current_location ();
> + current_char = peek_input ();
> + skip_input ();
> +
> + // detect UTF8 bom
> + //
> + // Must be the first thing on the first line.
> + // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
> + // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
> + if (current_line == 1 && current_column == 1 && current_char == 0xef
> + && peek_input () == 0xbb && peek_input (1) == 0xbf)
> + {
> + skip_input (1);
> + current_char = peek_input ();
> + skip_input ();
> + }
> +
> + // detect shebang
> + // Must be the first thing on the first line, starting with #!
> + // But since an attribute can also start with an #! we don't count it as a
> + // shebang line when after any whitespace or comments there is a [. If it
> + // is a shebang line we simply drop the line. Otherwise we don't consume
> + // any characters and fall through to the real tokenizer.
> + if (current_line == 1 && current_column == 1 && current_char == '#'
> + && peek_input () == '!')
> + {
> + int n = 1;
> + while (true)
> + {
> + int next_char = peek_input (n);
> + if (is_whitespace (next_char))
> + n++;
> + else if ((next_char == '/' && peek_input (n + 1) == '/'
> + && peek_input (n + 2) != '!'
> + && peek_input (n + 2) != '/')
> + || (next_char == '/' && peek_input (n + 1) == '/'
> + && peek_input (n + 2) == '/'
> + && peek_input (n + 3) == '/'))
> + {
> + // two // or four ////
> + // A single line comment
> + // (but not an inner or outer doc comment)
> + n += 2;
> + next_char = peek_input (n);
> + while (next_char != '\n' && next_char != EOF)
> + {
> + n++;
> + next_char = peek_input (n);
> + }
> + if (next_char == '\n')
> + n++;
> + }
> + else if (next_char == '/' && peek_input (n + 1) == '*'
> + && peek_input (n + 2) == '*'
> + && peek_input (n + 3) == '/')
> + {
> + /**/
> + n += 4;
> + }
> + else if (next_char == '/' && peek_input (n + 1) == '*'
> + && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
> + && peek_input (n + 4) == '/')
> + {
> + /***/
> + n += 5;
> + }
> + else if ((next_char == '/' && peek_input (n + 1) == '*'
> + && peek_input (n + 2) != '*'
> + && peek_input (n + 2) != '!')
> + || (next_char == '/' && peek_input (n + 1) == '*'
> + && peek_input (n + 2) == '*'
> + && peek_input (n + 3) == '*'))
> + {
> + // one /* or three /***
> + // Start of a block comment
> + // (but not an inner or outer doc comment)
> + n += 2;
> + int level = 1;
> + while (level > 0)
> + {
> + if (peek_input (n) == EOF)
> + break;
> + else if (peek_input (n) == '/'
> + && peek_input (n + 1) == '*')
> + {
> + n += 2;
> + level += 1;
> + }
> + else if (peek_input (n) == '*'
> + && peek_input (n + 1) == '/')
> + {
> + n += 2;
> + level -= 1;
> + }
> + else
> + n++;
> + }
> + }
> + else if (next_char != '[')
> + {
> + // definitely shebang, ignore the first line
> + while (current_char != '\n' && current_char != EOF)
> + {
> + current_char = peek_input ();
> + skip_input ();
> + }
> +
> + // current_char is now the terminating '\n' (or EOF). Do not
> + // advance current_line here: this break only leaves the
> + // look-ahead loop, so the '\n' case of the switch below (or
> + // the EOF check) still sees this character. Counting the
> + // newline in both places would shift every later location
> + // by one line.
> + break;
> + }
> + else
> + break; /* Definitely not a shebang line. */
> + }
> + }
> +
> + // return end of file token if end of file
> + if (current_char == EOF)
> + return Token::make (END_OF_FILE, loc);
> +
> + // if not end of file, start tokenising
> + switch (current_char)
> + {
> + /* ignore whitespace characters for tokens but continue updating
> + * location */
> + case '\n': // newline
> + current_line++;
> + current_column = 1;
> + // tell line_table that new line starts
> + start_line (current_line, max_column_hint);
> + continue;
> + case '\r': // cr
> + // Ignore, we expect a newline (lf) soon.
> + continue;
> + case ' ': // space
> + current_column++;
> + continue;
> + case '\t': // tab
> + // width of a tab is not well-defined, assume 8 spaces
> + current_column += 8;
> + continue;
> +
> + // punctuation - actual tokens
> + case '=':
> + if (peek_input () == '>')
> + {
> + // match arm arrow
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (MATCH_ARROW, loc);
> + }
> + else if (peek_input () == '=')
> + {
> + // equality operator
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (EQUAL_EQUAL, loc);
> + }
> + else
> + {
> + // assignment operator
> + current_column++;
> + return Token::make (EQUAL, loc);
> + }
> + case '(':
> + current_column++;
> + return Token::make (LEFT_PAREN, loc);
> + case '-':
> + if (peek_input () == '>')
> + {
> + // return type specifier
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (RETURN_TYPE, loc);
> + }
> + else if (peek_input () == '=')
> + {
> + // minus-assign
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (MINUS_EQ, loc);
> + }
> + else
> + {
> + // minus
> + current_column++;
> + return Token::make (MINUS, loc);
> + }
> + case '+':
> + if (peek_input () == '=')
> + {
> + // add-assign
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (PLUS_EQ, loc);
> + }
> + else
> + {
> + // add
> + current_column++;
> + return Token::make (PLUS, loc);
> + }
> + case ')':
> + current_column++;
> + return Token::make (RIGHT_PAREN, loc);
> + case ';':
> + current_column++;
> + return Token::make (SEMICOLON, loc);
> + case '*':
> + if (peek_input () == '=')
> + {
> + // multiplication-assign
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (ASTERISK_EQ, loc);
> + }
> + else
> + {
> + // multiplication
> + current_column++;
> + return Token::make (ASTERISK, loc);
> + }
> + case ',':
> + current_column++;
> + return Token::make (COMMA, loc);
> + case '/':
> + if (peek_input () == '=')
> + {
> + // division-assign
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (DIV_EQ, loc);
> + }
> + else if ((peek_input () == '/' && peek_input (1) != '!'
> + && peek_input (1) != '/')
> + || (peek_input () == '/' && peek_input (1) == '/'
> + && peek_input (2) == '/'))
> + {
> + // two // or four ////
> + // single line comment
> + // (but not an inner or outer doc comment)
> + skip_input ();
> + current_column += 2;
> + current_char = peek_input ();
> +
> + // basically ignore until line finishes
> + while (current_char != '\n' && current_char != EOF)
> + {
> + skip_input ();
> + current_column++; // not used
> + current_char = peek_input ();
> + }
> + continue;
> + }
> + else if (peek_input () == '/'
> + && (peek_input (1) == '!' || peek_input (1) == '/'))
> + {
> + /* single line doc comment, inner or outer. */
> + bool is_inner = peek_input (1) == '!';
> + skip_input (1);
> + current_column += 3;
> +
> + std::string str;
> + str.reserve (32);
> + current_char = peek_input ();
> + while (current_char != '\n')
> + {
> + skip_input ();
> + if (current_char == '\r')
> + {
> + // int, not char: peek_input may return EOF, which must
> + // survive being copied back into current_char below.
> + int next_char = peek_input ();
> + if (next_char == '\n')
> + {
> + current_char = '\n';
> + break;
> + }
> + rust_error_at (
> + loc, "Isolated CR %<\\r%> not allowed in doc comment");
> + current_char = next_char;
> + continue;
> + }
> + if (current_char == EOF)
> + {
> + rust_error_at (
> + loc, "unexpected EOF while looking for end of comment");
> + break;
> + }
> + str += current_char;
> + current_char = peek_input ();
> + }
> + skip_input ();
> + current_line++;
> + current_column = 1;
> + // tell line_table that new line starts
> + start_line (current_line, max_column_hint);
> +
> + str.shrink_to_fit ();
> + if (is_inner)
> + return Token::make_inner_doc_comment (loc, std::move (str));
> + else
> + return Token::make_outer_doc_comment (loc, std::move (str));
> + }
> + else if (peek_input () == '*' && peek_input (1) == '*'
> + && peek_input (2) == '/')
> + {
> + /**/
> + skip_input (2);
> + current_column += 4;
> + continue;
> + }
> + else if (peek_input () == '*' && peek_input (1) == '*'
> + && peek_input (2) == '*' && peek_input (3) == '/')
> + {
> + /***/
> + skip_input (3);
> + current_column += 5;
> + continue;
> + }
> + else if ((peek_input () == '*' && peek_input (1) != '!'
> + && peek_input (1) != '*')
> + || (peek_input () == '*' && peek_input (1) == '*'
> + && peek_input (2) == '*'))
> + {
> + // one /* or three /***
> + // block comment
> + // (but not an inner or outer doc comment)
> + skip_input ();
> + current_column += 2;
> +
> + int level = 1;
> + while (level > 0)
> + {
> + current_char = peek_input ();
> +
> + if (current_char == EOF)
> + {
> + rust_error_at (
> + loc, "unexpected EOF while looking for end of comment");
> + break;
> + }
> +
> + // if /* found
> + if (current_char == '/' && peek_input (1) == '*')
> + {
> + // skip /* characters
> + skip_input (1);
> +
> + current_column += 2;
> +
> + level += 1;
> + continue;
> + }
> +
> + // ignore until */ is found
> + if (current_char == '*' && peek_input (1) == '/')
> + {
> + // skip */ characters
> + skip_input (1);
> +
> + current_column += 2;
> +
> + level -= 1;
> + continue;
> + }
> +
> + if (current_char == '\n')
> + {
> + skip_input ();
> + current_line++;
> + current_column = 1;
> + // tell line_table that new line starts
> + start_line (current_line, max_column_hint);
> + continue;
> + }
> +
> + skip_input ();
> + current_column++;
> + }
> +
> + // refresh new token
> + continue;
> + }
> + else if (peek_input () == '*'
> + && (peek_input (1) == '!' || peek_input (1) == '*'))
> + {
> + // block doc comment, inner /*! or outer /**
> + bool is_inner = peek_input (1) == '!';
> + skip_input (1);
> + current_column += 3;
> +
> + std::string str;
> + str.reserve (96);
> +
> + int level = 1;
> + while (level > 0)
> + {
> + current_char = peek_input ();
> +
> + if (current_char == EOF)
> + {
> + rust_error_at (
> + loc, "unexpected EOF while looking for end of comment");
> + break;
> + }
> +
> + // if /* found
> + if (current_char == '/' && peek_input (1) == '*')
> + {
> + // skip /* characters
> + skip_input (1);
> + current_column += 2;
> +
> + level += 1;
> + str += "/*";
> + continue;
> + }
> +
> + // ignore until */ is found
> + if (current_char == '*' && peek_input (1) == '/')
> + {
> + // skip */ characters
> + skip_input (1);
> + current_column += 2;
> +
> + level -= 1;
> + if (level > 0)
> + str += "*/";
> + continue;
> + }
> +
> + if (current_char == '\r' && peek_input (1) != '\n')
> + rust_error_at (
> + loc, "Isolated CR %<\\r%> not allowed in doc comment");
> +
> + if (current_char == '\n')
> + {
> + skip_input ();
> + current_line++;
> + current_column = 1;
> + // tell line_table that new line starts
> + start_line (current_line, max_column_hint);
> + str += '\n';
> + continue;
> + }
> +
> + str += current_char;
> + skip_input ();
> + current_column++;
> + }
> +
> + str.shrink_to_fit ();
> + if (is_inner)
> + return Token::make_inner_doc_comment (loc, std::move (str));
> + else
> + return Token::make_outer_doc_comment (loc, std::move (str));
> + }
> + else
> + {
> + // division
> + current_column++;
> + return Token::make (DIV, loc);
> + }
> + case '%':
> + if (peek_input () == '=')
> + {
> + // modulo-assign
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (PERCENT_EQ, loc);
> + }
> + else
> + {
> + // modulo
> + current_column++;
> + return Token::make (PERCENT, loc);
> + }
> + case '^':
> + if (peek_input () == '=')
> + {
> + // xor-assign?
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (CARET_EQ, loc);
> + }
> + else
> + {
> + // xor?
> + current_column++;
> + return Token::make (CARET, loc);
> + }
> + case '<':
> + if (peek_input () == '<')
> + {
> + if (peek_input (1) == '=')
> + {
> + // left-shift assign
> + skip_input (1);
> + current_column += 3;
> +
> + return Token::make (LEFT_SHIFT_EQ, loc);
> + }
> + else
> + {
> + // left-shift
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (LEFT_SHIFT, loc);
> + }
> + }
> + else if (peek_input () == '=')
> + {
> + // smaller than or equal to
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (LESS_OR_EQUAL, loc);
> + }
> + else
> + {
> + // smaller than
> + current_column++;
> + return Token::make (LEFT_ANGLE, loc);
> + }
> + break;
> + case '>':
> + if (peek_input () == '>')
> + {
> + if (peek_input (1) == '=')
> + {
> + // right-shift-assign
> + skip_input (1);
> + current_column += 3;
> +
> + return Token::make (RIGHT_SHIFT_EQ, loc);
> + }
> + else
> + {
> + // right-shift
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (RIGHT_SHIFT, loc);
> + }
> + }
> + else if (peek_input () == '=')
> + {
> + // larger than or equal to
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (GREATER_OR_EQUAL, loc);
> + }
> + else
> + {
> + // larger than
> + current_column++;
> + return Token::make (RIGHT_ANGLE, loc);
> + }
> + case ':':
> + if (peek_input () == ':')
> + {
> + // scope resolution ::
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (SCOPE_RESOLUTION, loc);
> + }
> + else
> + {
> + // single colon :
> + current_column++;
> + return Token::make (COLON, loc);
> + }
> + case '!':
> + // no special handling for macros in lexer?
> + if (peek_input () == '=')
> + {
> + // not equal boolean operator
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (NOT_EQUAL, loc);
> + }
> + else
> + {
> + // not equal unary operator
> + current_column++;
> +
> + return Token::make (EXCLAM, loc);
> + }
> + case '?':
> + current_column++;
> + return Token::make (QUESTION_MARK, loc);
> + case '#':
> + current_column++;
> + return Token::make (HASH, loc);
> + case '[':
> + current_column++;
> + return Token::make (LEFT_SQUARE, loc);
> + case ']':
> + current_column++;
> + return Token::make (RIGHT_SQUARE, loc);
> + case '{':
> + current_column++;
> + return Token::make (LEFT_CURLY, loc);
> + case '}':
> + current_column++;
> + return Token::make (RIGHT_CURLY, loc);
> + case '@':
> + current_column++;
> + return Token::make (PATTERN_BIND, loc);
> + case '$':
> + current_column++;
> + return Token::make (DOLLAR_SIGN, loc);
> + case '~':
> + current_column++;
> + return Token::make (TILDE, loc);
> + case '\\':
> + current_column++;
> + return Token::make (BACKSLASH, loc);
> + case '`':
> + current_column++;
> + return Token::make (BACKTICK, loc);
> + case '|':
> + if (peek_input () == '=')
> + {
> + // bitwise or-assign?
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (PIPE_EQ, loc);
> + }
> + else if (peek_input () == '|')
> + {
> + // logical or
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (OR, loc);
> + }
> + else
> + {
> + // bitwise or
> + current_column++;
> +
> + return Token::make (PIPE, loc);
> + }
> + case '&':
> + if (peek_input () == '=')
> + {
> + // bitwise and-assign?
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (AMP_EQ, loc);
> + }
> + else if (peek_input () == '&')
> + {
> + // logical and
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (LOGICAL_AND, loc);
> + }
> + else
> + {
> + // bitwise and/reference
> + current_column++;
> +
> + return Token::make (AMP, loc);
> + }
> + case '.':
> + if (peek_input () == '.')
> + {
> + if (peek_input (1) == '.')
> + {
> + // ellipsis
> + skip_input (1);
> + current_column += 3;
> +
> + return Token::make (ELLIPSIS, loc);
> + }
> + else if (peek_input (1) == '=')
> + {
> + // ..=
> + skip_input (1);
> + current_column += 3;
> +
> + return Token::make (DOT_DOT_EQ, loc);
> + }
> + else
> + {
> + // ..
> + skip_input ();
> + current_column += 2;
> +
> + return Token::make (DOT_DOT, loc);
> + }
> + }
> + else /*if (!ISDIGIT (peek_input ()))*/
> + {
> + // single dot .
> + // Only if followed by a non-number - otherwise is float
> + // nope, float cannot start with '.'.
> + current_column++;
> + return Token::make (DOT, loc);
> + }
> + }
> + // TODO: special handling of _ in the lexer? instead of being identifier
> +
> + // byte character, byte string and raw byte string literals
> + if (current_char == 'b')
> + {
> + if (peek_input () == '\'')
> + return parse_byte_char (loc);
> + else if (peek_input () == '"')
> + return parse_byte_string (loc);
> + else if (peek_input () == 'r'
> + && (peek_input (1) == '#' || peek_input (1) == '"'))
> + return parse_raw_byte_string (loc);
> + }
> +
> + // raw identifiers and raw strings
> + if (current_char == 'r')
> + {
> + int peek = peek_input ();
> + int peek1 = peek_input (1);
> +
> + if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
> + {
> + TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
> + if (raw_ident_ptr != nullptr)
> + return raw_ident_ptr;
> + else
> + continue; /* input got parsed, it just wasn't valid. An error
> + was produced. */
> + }
> + else
> + {
> + TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
> + if (maybe_raw_string_ptr != nullptr)
> + return maybe_raw_string_ptr;
> + }
> + }
> +
> + // find identifiers and keywords
> + if (ISALPHA (current_char) || current_char == '_')
> + return parse_identifier_or_keyword (loc);
> +
> + // int and float literals
> + if (ISDIGIT (current_char))
> + { // _ not allowed as first char
> + if (current_char == '0'
> + && is_non_decimal_int_literal_separator (peek_input ()))
> + {
> + // handle binary, octal, hex literals
> + TokenPtr non_dec_int_lit_ptr
> + = parse_non_decimal_int_literals (loc);
> + if (non_dec_int_lit_ptr != nullptr)
> + return non_dec_int_lit_ptr;
> + }
> + else
> + {
> + // handle decimals (integer or float)
> + TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
> + if (decimal_or_float_ptr != nullptr)
> + return decimal_or_float_ptr;
> + }
> + }
> +
> + // string literals
> + if (current_char == '"')
> + return parse_string (loc);
> +
> + // char literals and lifetime names
> + if (current_char == '\'')
> + {
> + TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
> + if (char_or_lifetime_ptr != nullptr)
> + return char_or_lifetime_ptr;
> + }
> +
> + // DEBUG: check for specific character problems:
> + if (current_char == '0')
> + rust_debug ("'0' uncaught before unexpected character");
> + else if (current_char == ']')
> + rust_debug ("']' uncaught before unexpected character");
> + else if (current_char == 0x5d)
> + rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
> + "unexpected character");
> +
> + // didn't match anything so error
> + rust_error_at (loc, "unexpected character %<%x%>", current_char);
> + current_column++;
> + }
> +}
> +
> +/* Parses in a type suffix (the run of letters, digits and underscores
> + * starting at current_char) and returns the matching core type together with
> + * the number of characters consumed.  Underscores are consumed and counted
> + * but are not part of the suffix.  An empty suffix yields CORETYPE_UNKNOWN
> + * silently; an unrecognised one yields CORETYPE_UNKNOWN after an error.  */
> +std::pair<PrimitiveCoreType, int>
> +Lexer::parse_in_type_suffix ()
> +{
> +  // Spellings of the recognised suffixes and the core types they select.
> +  static const struct
> +  {
> +    const char *spelling;
> +    PrimitiveCoreType type;
> +  } known_suffixes[] = {
> +    {"f32", CORETYPE_F32},     {"f64", CORETYPE_F64},
> +    {"i8", CORETYPE_I8},       {"i16", CORETYPE_I16},
> +    {"i32", CORETYPE_I32},     {"i64", CORETYPE_I64},
> +    {"i128", CORETYPE_I128},   {"isize", CORETYPE_ISIZE},
> +    {"u8", CORETYPE_U8},       {"u16", CORETYPE_U16},
> +    {"u32", CORETYPE_U32},     {"u64", CORETYPE_U64},
> +    {"u128", CORETYPE_U128},   {"usize", CORETYPE_USIZE},
> +  };
> +
> +  std::string suffix;
> +  suffix.reserve (5);
> +
> +  int consumed = 0;
> +
> +  // get suffix
> +  for (; ISALPHA (current_char) || ISDIGIT (current_char)
> +	 || current_char == '_';
> +       current_char = peek_input ())
> +    {
> +      // don't add _ to suffix
> +      if (current_char != '_')
> +	suffix += current_char;
> +
> +      consumed++;
> +      skip_input ();
> +    }
> +
> +  // no type suffix: do nothing but also no error
> +  if (suffix.empty ())
> +    return std::make_pair (CORETYPE_UNKNOWN, consumed);
> +
> +  for (const auto &candidate : known_suffixes)
> +    {
> +      if (suffix == candidate.spelling)
> +	return std::make_pair (candidate.type, consumed);
> +    }
> +
> +  rust_error_at (get_current_location (), "unknown number suffix %qs",
> +		 suffix.c_str ());
> +
> +  return std::make_pair (CORETYPE_UNKNOWN, consumed);
> +}
> +
> +/* Reads the exponent of a float literal when current_char is `e` or `E`.
> + * Returns the exponent text (the marker, an optional `-`, then whatever
> + * parse_in_decimal yields) and the number of characters consumed.  Both are
> + * empty/zero when there is no exponent.  A leading `+` is consumed and counted
> + * but left out of the returned text.  */
> +std::pair<std::string, int>
> +Lexer::parse_in_exponent_part ()
> +{
> +  std::string exponent_text;
> +  int consumed = 0;
> +
> +  // nothing to do unless an exponent marker comes next
> +  if (current_char != 'E' && current_char != 'e')
> +    return std::make_pair (exponent_text, consumed);
> +
> +  // add exponent to string as strtod works with it
> +  exponent_text += current_char;
> +  skip_input ();
> +  current_char = peek_input ();
> +  consumed++;
> +
> +  // optional sign: '-' is kept in the text, '+' is only skipped
> +  if (current_char == '-' || current_char == '+')
> +    {
> +      if (current_char == '-')
> +        exponent_text += '-';
> +
> +      skip_input ();
> +      current_char = peek_input ();
> +      consumed++;
> +    }
> +
> +  // the exponent digits are parsed like any other decimal number
> +  auto exponent_digits = parse_in_decimal ();
> +  exponent_text += std::get<0> (exponent_digits);
> +  consumed += std::get<1> (exponent_digits);
> +
> +  return std::make_pair (exponent_text, consumed);
> +}
> +
> +/* Reads a run of decimal digits and underscores starting at current_char.
> + * Returns (digits without underscores, number of characters consumed, whether
> + * the run was "pure", i.e. contained no underscore).  */
> +std::tuple<std::string, int, bool>
> +Lexer::parse_in_decimal ()
> +{
> +  std::string digits;
> +  int consumed = 0;
> +  /* A pure decimal contains only digits. */
> +  bool only_digits = true;
> +
> +  while (ISDIGIT (current_char) || current_char == '_')
> +    {
> +      if (current_char == '_')
> +        only_digits = false; // separator: counted, but not stored
> +      else
> +        digits += current_char;
> +
> +      skip_input ();
> +      current_char = peek_input ();
> +      consumed++;
> +    }
> +
> +  return std::make_tuple (digits, consumed, only_digits);
> +}
> +
> +/* Parses an escape sequence (or string continue) inside a byte character or
> + * byte string.  The character at the current position is skipped first (the
> + * callers in this file invoke this when current_char is the backslash).
> + * Unicode escapes are rejected here.
> + *
> + * OPENING_CHAR is the quote that opened the literal; it only selects the
> + * wording ("character" vs "string") of the unicode-escape error.
> + *
> + * Returns (byte value, number of characters consumed, is-string-continue).
> + * The third member is true only when the backslash was followed by a line
> + * break; the second member is then whatever parse_partial_string_continue
> + * returned.  */
> +std::tuple<char, int, bool>
> +Lexer::parse_escape (char opening_char)
> +{
> +  int additional_length_offset = 0;
> +  char output_char = 0;
> +
> +  // skip to actual letter
> +  skip_input ();
> +  current_char = peek_input ();
> +  additional_length_offset++;
> +
> +  switch (current_char)
> +    {
> +      case 'x': {
> +        auto hex_escape_pair = parse_partial_hex_escape ();
> +        long hexLong = hex_escape_pair.first;
> +        additional_length_offset += hex_escape_pair.second;
> +
> +        if (hexLong > 255 || hexLong < 0)
> +          rust_error_at (
> +            get_current_location (),
> +            "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
> +            static_cast<unsigned int> (hexLong));
> +        /* TODO: restore capital for escape output - gcc pretty-printer doesn't
> +         * support %X directly */
> +        char hexChar = static_cast<char> (hexLong);
> +
> +        output_char = hexChar;
> +      }
> +      break;
> +    case 'n':
> +      output_char = '\n';
> +      break;
> +    case 'r':
> +      output_char = '\r';
> +      break;
> +    case 't':
> +      output_char = '\t';
> +      break;
> +    case '\\':
> +      output_char = '\\';
> +      break;
> +    case '0':
> +      output_char = '\0';
> +      break;
> +    case '\'':
> +      output_char = '\'';
> +      break;
> +    case '"':
> +      output_char = '"';
> +      break;
> +    case 'u':
> +      rust_error_at (get_current_location (),
> +                     "cannot have a unicode escape \\u in a byte %s",
> +                     opening_char == '\'' ? "character" : "string");
> +      /* Try to parse it anyway, just to skip it.  The characters it consumed
> +       * still have to be counted, otherwise the caller's column bookkeeping
> +       * misses the skipped escape (parse_utf8_escape counts them too).  */
> +      additional_length_offset += parse_partial_unicode_escape ().second;
> +      return std::make_tuple (output_char, additional_length_offset, false);
> +    case '\r':
> +    case '\n':
> +      // string continue
> +      return std::make_tuple (0, parse_partial_string_continue (), true);
> +    default:
> +      rust_error_at (get_current_location (),
> +                     "unknown escape sequence %<\\%c%>", current_char);
> +      // the unknown character is left in the input for the caller
> +      return std::make_tuple (output_char, additional_length_offset, false);
> +    }
> +  // all non-special cases (string continue) should skip their used char
> +  skip_input ();
> +  current_char = peek_input ();
> +  additional_length_offset++;
> +
> +  return std::make_tuple (output_char, additional_length_offset, false);
> +}
> +
> +/* Parses an escape sequence (or string continue) inside a string or character
> + * literal, including `\u{...}` unicode escapes.  The character at the current
> + * position is skipped first (callers in this file invoke this on a backslash).
> + * OPENING_CHAR is accepted for symmetry with parse_escape but is not consulted.
> + *
> + * Returns (codepoint, number of characters consumed, is-string-continue); the
> + * third member is true only when the backslash was followed by a line break.  */
> +std::tuple<Codepoint, int, bool>
> +Lexer::parse_utf8_escape (char opening_char)
> +{
> +  Codepoint escaped;
> +  int consumed = 0;
> +
> +  // move from the backslash onto the escape letter
> +  skip_input ();
> +  current_char = peek_input ();
> +  consumed++;
> +
> +  // a backslash directly before a line break is a string continue
> +  if (current_char == '\r' || current_char == '\n')
> +    return std::make_tuple (0, parse_partial_string_continue (), true);
> +
> +  // unicode escapes consume their own input, so they return straight away
> +  if (current_char == 'u')
> +    {
> +      auto unicode_result = parse_partial_unicode_escape ();
> +      escaped = unicode_result.first;
> +      consumed += unicode_result.second;
> +
> +      return std::make_tuple (escaped, consumed, false);
> +    }
> +
> +  switch (current_char)
> +    {
> +      case 'x': {
> +        auto hex_result = parse_partial_hex_escape ();
> +        long hex_value = hex_result.first;
> +        consumed += hex_result.second;
> +
> +        if (hex_value > 127 || hex_value < 0)
> +          rust_error_at (
> +            get_current_location (),
> +            "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
> +            static_cast<unsigned int> (hex_value));
> +        /* TODO: restore capital for escape output - gcc pretty-printer doesn't
> +         * support %X directly */
> +        char hex_as_char = static_cast<char> (hex_value);
> +
> +        escaped = hex_as_char;
> +      }
> +      break;
> +    case 'n':
> +      escaped = '\n';
> +      break;
> +    case 'r':
> +      escaped = '\r';
> +      break;
> +    case 't':
> +      escaped = '\t';
> +      break;
> +    case '\\':
> +      escaped = '\\';
> +      break;
> +    case '0':
> +      escaped = '\0';
> +      break;
> +    case '\'':
> +      escaped = '\'';
> +      break;
> +    case '"':
> +      escaped = '"';
> +      break;
> +    default:
> +      rust_error_at (get_current_location (),
> +                     "unknown escape sequence %<\\%c%>", current_char);
> +      // the unknown character stays in the input for the caller
> +      return std::make_tuple (escaped, consumed, false);
> +    }
> +
> +  /* every escape that reaches this point still has its last character in the
> +   * input, so step over it */
> +  skip_input ();
> +  current_char = peek_input ();
> +  consumed++;
> +
> +  return std::make_tuple (escaped, consumed, false);
> +}
> +
> +/* Consumes the whitespace that follows a string-continue backslash, updating
> + * the line/column bookkeeping on every newline.  Returns a length that starts
> + * at 1, grows by one per non-newline whitespace character, and is reset to 1
> + * whenever a newline is consumed.  */
> +int
> +Lexer::parse_partial_string_continue ()
> +{
> +  int length_on_line = 1;
> +
> +  while (is_whitespace (current_char))
> +    {
> +      if (current_char == '\n')
> +        {
> +          current_line++;
> +          current_column = 1;
> +          // tell line_table that new line starts
> +          start_line (current_line, max_column_hint);
> +
> +          // a fresh line restarts the count
> +          length_on_line = 1;
> +        }
> +      else
> +        {
> +          length_on_line++;
> +        }
> +
> +      // move on to the next character either way
> +      skip_input ();
> +      current_char = peek_input ();
> +    }
> +
> +  return length_on_line;
> +}
> +
> +/* Parses the body of a '\x' escape: exactly two hex digits. Note that it does
> + * not check that the number is valid and smaller than 255; callers do their own
> + * range check.
> + *
> + * The digits are read with peek_input (1), i.e. one position past the current
> + * one, so the current position is not itself examined here.
> + *
> + * Returns (value, length). On success the length is 2. If the first character
> + * is not a hex digit the result is (0, 0) and nothing has been skipped; if the
> + * second is not a hex digit the result is (0, 1) and one character has been
> + * skipped. An error is reported in both failure cases. */
> +std::pair<long, int>
> +Lexer::parse_partial_hex_escape ()
> +{
> + // hex char string (null-terminated)
> + char hexNum[3] = {0, 0, 0};
> +
> + // first hex char
> + current_char = peek_input (1);
> + int additional_length_offset = 1;
> +
> + if (!is_x_digit (current_char))
> + {
> + rust_error_at (get_current_location (),
> + "invalid character %<\\x%c%> in \\x sequence",
> + current_char);
> + return std::make_pair (0, 0);
> + }
> + hexNum[0] = current_char;
> +
> + // second hex char: advance one position, then look one past it again
> + skip_input ();
> + current_char = peek_input (1);
> + additional_length_offset++;
> +
> + if (!is_x_digit (current_char))
> + {
> + rust_error_at (get_current_location (),
> + "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
> + current_char);
> + return std::make_pair (0, 1);
> + }
> + // only one more skip here; parse_escape and parse_utf8_escape perform a
> + // further skip of their own after this function returns
> + skip_input ();
> + hexNum[1] = current_char;
> +
> + // hexNum now holds two hex digits plus the terminating NUL
> + long hexLong = std::strtol (hexNum, nullptr, 16);
> +
> + return std::make_pair (hexLong, additional_length_offset);
> +}
> +
> +/* Parses the body of a unicode escape, i.e. the `{XXXXXX}` part: one to six hex
> + * digits, optionally separated by underscores, inside braces. One input
> + * character is skipped before the `{` is looked for.
> + *
> + * Returns (codepoint, number of characters counted). Every malformed input is
> + * reported with rust_error_at and yields Codepoint (0): missing `{`, missing or
> + * wrong terminator, an invalid character inside the braces, a digit count
> + * outside 1-6, a surrogate value, or a value above 10FFFF. */
> +std::pair<Codepoint, int>
> +Lexer::parse_partial_unicode_escape ()
> +{
> + // this first skip is not added to additional_length_offset
> + skip_input ();
> + current_char = peek_input ();
> + int additional_length_offset = 0;
> +
> + if (current_char != '{')
> + {
> + rust_error_at (get_current_location (),
> + "unicode escape should start with %<{%>");
> + /* Skip what should probably have been between brackets. */
> + while (is_x_digit (current_char) || current_char == '_')
> + {
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> + }
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> +
> + // step over the opening brace
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> +
> + if (current_char == '_')
> + {
> + rust_error_at (get_current_location (),
> + "unicode escape cannot start with %<_%>");
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> + // fallthrough and try to parse the rest anyway
> + }
> +
> + // parse unicode escape - 1-6 hex digits
> + std::string num_str;
> + num_str.reserve (6);
> +
> + // loop through to add entire hex number to string
> + while (is_x_digit (current_char) || current_char == '_')
> + {
> + if (current_char == '_')
> + {
> + // don't add _ to number
> + skip_input ();
> + current_char = peek_input ();
> +
> + additional_length_offset++;
> +
> + continue;
> + }
> +
> + additional_length_offset++;
> +
> + // add raw hex numbers
> + num_str += current_char;
> +
> + skip_input ();
> + current_char = peek_input ();
> + }
> +
> + if (current_char == '}')
> + {
> + // well-formed terminator: consume it
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> + }
> + else
> + {
> + // Actually an error, but allow propagation anyway. Assume that a
> + // wrong bracket, whitespace or single/double quotes are a wrong
> + // termination; otherwise it is a wrong character, then skip to the actual
> + // terminator.
> + if (current_char == '{' || is_whitespace (current_char)
> + || current_char == '\'' || current_char == '"')
> + {
> + // the offending character is left in the input
> + rust_error_at (get_current_location (),
> + "expected terminating %<}%> in unicode escape");
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> + else
> + {
> + rust_error_at (get_current_location (),
> + "invalid character %<%c%> in unicode escape",
> + current_char);
> + // NOTE(review): this loop has no explicit end-of-input test; whether it
> + // stops at EOF depends on how is_whitespace treats EOF - confirm
> + while (current_char != '}' && current_char != '{'
> + && !is_whitespace (current_char) && current_char != '\''
> + && current_char != '"')
> + {
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> + }
> + // Consume the actual closing bracket if found
> + if (current_char == '}')
> + {
> + skip_input ();
> + current_char = peek_input ();
> + additional_length_offset++;
> + }
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> + }
> +
> + // ensure 1-6 hex characters
> + if (num_str.length () > 6 || num_str.length () < 1)
> + {
> + rust_error_at (get_current_location (),
> + "unicode escape should be between 1 and 6 hex "
> + "characters; it is %lu",
> + (unsigned long) num_str.length ());
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> +
> + unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
> +
> + // reject the surrogate range D800..DFFF (bounds are exclusive comparisons)
> + if (hex_num > 0xd7ff && hex_num < 0xe000)
> + {
> + rust_error_at (
> + get_current_location (),
> + "unicode escape cannot be a surrogate value (D800 to DFFF)");
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> +
> + if (hex_num > 0x10ffff)
> + {
> + rust_error_at (get_current_location (),
> + "unicode escape cannot be larger than 10FFFF");
> + return std::make_pair (Codepoint (0), additional_length_offset);
> + }
> +
> + // valid escape: hand back the codepoint and the counted length
> + return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
> + additional_length_offset);
> +}
> +
> +/* Parses a byte character literal and returns its token, created at LOC.
> + * One input character is skipped up front (presumably the opening quote -
> + * confirm against the caller).  Errors are reported for an empty literal and
> + * for a missing closing quote, but a token is produced in every case.  */
> +TokenPtr
> +Lexer::parse_byte_char (Location loc)
> +{
> +  skip_input ();
> +  current_column++;
> +  // make current char the next character
> +  current_char = peek_input ();
> +
> +  int consumed = 1;
> +
> +  // value stored in the token; stays 0 when the literal is empty
> +  char value = 0;
> +
> +  if (current_char == '\'')
> +    {
> +      // closing quote straight away: nothing between the quotes
> +      rust_error_at (get_current_location (),
> +                     "no character inside %<%> for %<byte char%>");
> +    }
> +  else
> +    {
> +      if (current_char == '\\')
> +        {
> +          // escaped byte
> +          auto escape_result = parse_escape ('\'');
> +          value = std::get<0> (escape_result);
> +          consumed += std::get<1> (escape_result);
> +
> +          current_char = peek_input ();
> +        }
> +      else
> +        {
> +          // plain byte taken directly from the input
> +          value = current_char;
> +
> +          skip_input ();
> +          current_char = peek_input ();
> +          consumed++;
> +        }
> +
> +      if (current_char != '\'')
> +        rust_error_at (get_current_location (), "unclosed %<byte char%>");
> +
> +      // the next character is skipped whether or not it was the quote
> +      skip_input ();
> +      current_char = peek_input ();
> +      consumed++;
> +    }
> +
> +  current_column += consumed;
> +
> +  return Token::make_byte_char (loc, value);
> +}
> +
> +/* Parses a byte string literal and returns its token, created at LOC.
> + * Escapes are resolved through parse_escape.  When the input ends before the
> + * closing quote an error is reported and an END_OF_FILE token is returned
> + * instead.  */
> +TokenPtr
> +Lexer::parse_byte_string (Location loc)
> +{
> +  // skip quote character
> +  skip_input ();
> +  current_column++;
> +
> +  std::string contents;
> +  contents.reserve (16); // some sensible default
> +
> +  int consumed = 1;
> +
> +  for (current_char = peek_input (); current_char != '"' && current_char != EOF;)
> +    {
> +      if (current_char != '\\')
> +        {
> +          // ordinary byte: store it and advance
> +          consumed++;
> +
> +          contents += current_char;
> +          skip_input ();
> +          current_char = peek_input ();
> +          continue;
> +        }
> +
> +      auto escape_result = parse_escape ('"');
> +      const char escaped = std::get<0> (escape_result);
> +      const bool string_continue = std::get<2> (escape_result);
> +
> +      if (escaped == 0 && string_continue)
> +        {
> +          // string continue: the count restarts and nothing is stored
> +          consumed = std::get<1> (escape_result) - 1;
> +        }
> +      else
> +        {
> +          consumed += std::get<1> (escape_result);
> +          contents += escaped;
> +        }
> +    }
> +
> +  current_column += consumed;
> +
> +  if (current_char == EOF)
> +    {
> +      rust_error_at (get_current_location (), "unended byte string literal");
> +      return Token::make (END_OF_FILE, get_current_location ());
> +    }
> +
> +  // the loop only stops on a closing quote or on EOF
> +  if (current_char != '"')
> +    gcc_unreachable ();
> +
> +  current_column++;
> +
> +  skip_input ();
> +  current_char = peek_input ();
> +
> +  contents.shrink_to_fit ();
> +
> +  return Token::make_byte_string (loc, std::move (contents));
> +}
> +
> +/* Parses a raw byte string literal and returns its token, created at LOC.
> + * The literal ends at the first `"` that is followed by as many `#` characters
> + * as preceded the opening quote.  Characters above 127 are reported as errors
> + * and stored as NUL.  If the input ends before the terminator is found, an
> + * error is reported and an END_OF_FILE token is returned, as parse_byte_string
> + * does for ordinary byte strings.  */
> +TokenPtr
> +Lexer::parse_raw_byte_string (Location loc)
> +{
> +  // raw byte string literals
> +  std::string str;
> +  str.reserve (16); // some sensible default
> +
> +  int length = 1;
> +  int hash_count = 0;
> +
> +  // get hash count at beginning
> +  skip_input ();
> +  current_char = peek_input ();
> +  length++;
> +  while (current_char == '#')
> +    {
> +      hash_count++;
> +      length++;
> +
> +      skip_input ();
> +      current_char = peek_input ();
> +    }
> +
> +  if (current_char != '"')
> +    {
> +      rust_error_at (get_current_location (),
> +                     "raw byte string has no opening %<\"%>");
> +    }
> +
> +  skip_input ();
> +  current_char = peek_input ();
> +  length++;
> +
> +  while (true)
> +    {
> +      /* Without this check an unterminated literal would never leave the
> +       * loop: at end of input no closing quote can be seen any more.  */
> +      if (current_char == EOF)
> +        {
> +          current_column += length;
> +
> +          rust_error_at (get_current_location (),
> +                         "unended raw byte string literal");
> +          return Token::make (END_OF_FILE, get_current_location ());
> +        }
> +
> +      if (current_char == '"')
> +        {
> +          // a quote only terminates when enough hashes follow it
> +          bool enough_hashes = true;
> +
> +          for (int i = 0; i < hash_count; i++)
> +            {
> +              if (peek_input (i + 1) != '#')
> +                {
> +                  enough_hashes = false;
> +                  break;
> +                }
> +            }
> +
> +          if (enough_hashes)
> +            {
> +              // skip enough input and peek enough input
> +              skip_input (hash_count);
> +              current_char = peek_input ();
> +              length += hash_count + 1;
> +              break;
> +            }
> +        }
> +
> +      if ((unsigned char) current_char > 127)
> +        {
> +          rust_error_at (get_current_location (),
> +                         "character %<%c%> in raw byte string out of range",
> +                         current_char);
> +          current_char = 0;
> +        }
> +
> +      length++;
> +
> +      str += current_char;
> +      skip_input ();
> +      current_char = peek_input ();
> +    }
> +
> +  current_column += length;
> +
> +  str.shrink_to_fit ();
> +
> +  return Token::make_byte_string (loc, std::move (str));
> +}
> +
> +/* Parses a raw identifier and returns an identifier token created at LOC.
> + * A lone `_` is reported as an error but still produces a token.  The names
> + * `crate`, `extern`, `self`, `super` and `Self` are reported as forbidden and
> + * yield nullptr.  */
> +TokenPtr
> +Lexer::parse_raw_identifier (Location loc)
> +{
> +  std::string name;
> +  name.reserve (16); // default
> +
> +  skip_input ();
> +  current_char = peek_input ();
> +
> +  current_column += 2;
> +
> +  const bool starts_with_underscore = current_char == '_';
> +
> +  // gather every letter, digit and underscore of the name
> +  int name_length = 0;
> +  for (current_char = peek_input ();
> +       ISALPHA (current_char) || ISDIGIT (current_char)
> +       || current_char == '_';
> +       current_char = peek_input ())
> +    {
> +      name_length++;
> +
> +      name += current_char;
> +      skip_input ();
> +    }
> +
> +  current_column += name_length;
> +
> +  // if just a single underscore, not an identifier
> +  if (starts_with_underscore && name_length == 1)
> +    rust_error_at (get_current_location (),
> +                   "%<_%> is not a valid raw identifier");
> +
> +  static const char *const forbidden_names[]
> +    = {"crate", "extern", "self", "super", "Self"};
> +
> +  for (const char *forbidden : forbidden_names)
> +    {
> +      if (name == forbidden)
> +        {
> +          rust_error_at (get_current_location (),
> +                         "%qs is a forbidden raw identifier", name.c_str ());
> +
> +          return nullptr;
> +        }
> +    }
> +
> +  name.shrink_to_fit ();
> +
> +  return Token::make_identifier (loc, std::move (name));
> +}
> +
> +/* Recovery helper for unterminated strings: skips input until a `"` or EOF is
> + * reached, keeping current_line/current_column up to date, and consumes the
> + * closing quote when there is one.  CURRENT_CHAR is the character to start
> + * from; being a parameter, updates to it here stay local to this function.  */
> +void
> +Lexer::skip_broken_string_input (int current_char)
> +{
> +  for (; current_char != '"' && current_char != EOF;
> +       current_char = peek_input ())
> +    {
> +      if (current_char != '\n')
> +        {
> +          current_column++;
> +        }
> +      else
> +        {
> +          current_line++;
> +          current_column = 1;
> +        }
> +
> +      skip_input ();
> +    }
> +
> +  // swallow the closing quote when the scan stopped on one
> +  if (current_char == '"')
> +    {
> +      current_column++;
> +
> +      skip_input ();
> +      current_char = peek_input ();
> +    }
> +
> +  rust_debug ("skipped to %d:%d due to bad quotes", current_line,
> +              current_column);
> +}
> +
> +/* Parses a unicode string literal and returns a string token created at LOC.
> + * The body is read codepoint by codepoint; escapes are resolved through
> + * parse_utf8_escape. If the input ends before a closing `"` is seen, an error
> + * is reported and an END_OF_FILE token is returned instead. */
> +TokenPtr
> +Lexer::parse_string (Location loc)
> +{
> + Codepoint current_char32;
> +
> + std::string str;
> + str.reserve (16); // some sensible default
> +
> + // running length, added to current_column once the body has been read
> + int length = 1;
> + current_char32 = peek_codepoint_input ();
> +
> + // FIXME: This fails if the input ends. How do we check for EOF?
> + while (current_char32.value != '"' && !current_char32.is_eof ())
> + {
> + if (current_char32.value == '\\')
> + {
> + // parse escape
> + // NOTE(review): '\'' is passed although this is a string literal
> + // (parse_byte_string passes '"' to parse_escape); parse_utf8_escape does
> + // not read its argument, so this has no effect today - confirm intent
> + auto utf8_escape_pair = parse_utf8_escape ('\'');
> + current_char32 = std::get<0> (utf8_escape_pair);
> +
> + // a string continue (NUL codepoint with the flag set) restarts the length
> + // and stores nothing; every other escape adds to both
> + if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
> + length = std::get<1> (utf8_escape_pair) - 1;
> + else
> + length += std::get<1> (utf8_escape_pair);
> +
> + if (current_char32 != Codepoint (0)
> + || !std::get<2> (utf8_escape_pair))
> + str += current_char32;
> +
> + // required as parsing utf8 escape only changes current_char
> + current_char32 = peek_codepoint_input ();
> +
> + continue;
> + }
> +
> + // ordinary codepoint: count its encoded length, store it, advance
> + length += get_input_codepoint_length ();
> +
> + str += current_char32;
> + skip_codepoint_input ();
> + current_char32 = peek_codepoint_input ();
> + }
> +
> + current_column += length;
> +
> + if (current_char32.value == '"')
> + {
> + // consume the closing quote
> + current_column++;
> +
> + skip_input ();
> + current_char = peek_input ();
> + }
> + else if (current_char32.is_eof ())
> + {
> + rust_error_at (get_current_location (), "unended string literal");
> + return Token::make (END_OF_FILE, get_current_location ());
> + }
> + else
> + {
> + // the loop above only exits on a closing quote or on EOF
> + gcc_unreachable ();
> + }
> +
> + str.shrink_to_fit ();
> + return Token::make_string (loc, std::move (str));
> +}
> +
> +/* Parses an identifier or keyword whose first character is current_char.
> + * Returns an UNDERSCORE token for a lone `_`, the keyword's own token when
> + * classify_keyword recognises the text, and an identifier token otherwise.
> + * All tokens are created at LOC.  */
> +TokenPtr
> +Lexer::parse_identifier_or_keyword (Location loc)
> +{
> +  const bool starts_with_underscore = current_char == '_';
> +
> +  std::string name;
> +  name.reserve (16); // default
> +  name += current_char;
> +
> +  // the first character is already part of the name
> +  int name_length = 1;
> +
> +  // append every following letter, digit and underscore
> +  for (current_char = peek_input ();
> +       ISALPHA (current_char) || ISDIGIT (current_char)
> +       || current_char == '_';
> +       current_char = peek_input ())
> +    {
> +      name_length++;
> +
> +      name += current_char;
> +      skip_input ();
> +    }
> +
> +  current_column += name_length;
> +
> +  // if just a single underscore, not an identifier
> +  if (starts_with_underscore && name_length == 1)
> +    return Token::make (UNDERSCORE, loc);
> +
> +  name.shrink_to_fit ();
> +
> +  const TokenId token_kind = classify_keyword (name);
> +  if (token_kind != IDENTIFIER)
> +    return Token::make (token_kind, loc);
> +
> +  return Token::make_identifier (loc, std::move (name));
> +}
> +
> +/* Looks ahead over any run of `#` characters; if a `"` follows, hands over to
> + * parse_raw_string with the number of hashes seen.  Returns nullptr when the
> + * upcoming input does not start a raw string.  */
> +TokenPtr
> +Lexer::maybe_parse_raw_string (Location loc)
> +{
> +  int hash_count = 0;
> +  for (; peek_input (hash_count) == '#'; hash_count++)
> +    ;
> +
> +  // anything other than a quote after the hashes is not a raw string
> +  if (peek_input (hash_count) != '"')
> +    return nullptr;
> +
> +  return parse_raw_string (loc, hash_count);
> +}
> +
> +/* Parses a raw string literal and returns a string token created at LOC.
> + * INITIAL_HASH_COUNT is the number of `#` characters before the opening quote;
> + * the literal ends at the first `"` followed by that many `#` characters.
> + *
> + * If the input ends before the terminator is found, an error is reported and
> + * an END_OF_FILE token is returned, as parse_string does for ordinary
> + * strings, instead of silently accepting the truncated literal.  */
> +TokenPtr
> +Lexer::parse_raw_string (Location loc, int initial_hash_count)
> +{
> +  // raw string literals
> +  std::string str;
> +  str.reserve (16); // some sensible default
> +
> +  int length = 1 + initial_hash_count;
> +
> +  if (initial_hash_count > 0)
> +    skip_input (initial_hash_count - 1);
> +
> +  current_char = peek_input ();
> +
> +  if (current_char != '"')
> +    rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
> +
> +  length++;
> +  skip_input ();
> +  Codepoint current_char32 = peek_codepoint_input ();
> +
> +  // set once the closing quote and its hashes have been consumed
> +  bool terminated = false;
> +
> +  while (!current_char32.is_eof ())
> +    {
> +      if (current_char32.value == '"')
> +        {
> +          // a quote only terminates when enough hashes follow it
> +          bool enough_hashes = true;
> +
> +          for (int i = 0; i < initial_hash_count; i++)
> +            {
> +              if (peek_input (i + 1) != '#')
> +                {
> +                  enough_hashes = false;
> +                  break;
> +                }
> +            }
> +
> +          if (enough_hashes)
> +            {
> +              // skip enough input and peek enough input
> +              skip_input (initial_hash_count);
> +              current_char = peek_input ();
> +              length += initial_hash_count + 1;
> +              terminated = true;
> +              break;
> +            }
> +        }
> +
> +      length++;
> +
> +      str += current_char32;
> +      skip_codepoint_input ();
> +      current_char32 = peek_codepoint_input ();
> +    }
> +
> +  current_column += length;
> +
> +  // leaving the loop without a terminator means the input ran out
> +  if (!terminated)
> +    {
> +      rust_error_at (get_current_location (), "unended raw string literal");
> +      return Token::make (END_OF_FILE, get_current_location ());
> +    }
> +
> +  str.shrink_to_fit ();
> +
> +  return Token::make_string (loc, std::move (str));
> +}
> +
> +/* Parses the digits of a hex, octal or binary integer literal and returns an
> + * int token created at LOC, holding the value in decimal notation.
> + *
> + * IS_DIGIT_FUNC decides which characters are digits of the literal,
> + * EXISTENT_STR is the text gathered so far (the digits are appended to it),
> + * and BASE is the radix used for the conversion.  Underscores between digits
> + * are consumed but ignored.  A float type suffix is reported as an error and
> + * yields nullptr.
> + *
> + * The value is converted with std::strtoull so that literals above LONG_MAX
> + * (for example 0xFFFFFFFFFFFFFFFF) keep their value; literals wider than
> + * unsigned long long still saturate.  */
> +template <typename IsDigitFunc>
> +TokenPtr
> +Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
> +                                      std::string existent_str, int base)
> +{
> +  int length = 1;
> +
> +  skip_input ();
> +  current_char = peek_input ();
> +
> +  length++;
> +
> +  // loop through to add entire number to string
> +  while (is_digit_func (current_char) || current_char == '_')
> +    {
> +      if (current_char == '_')
> +        {
> +          // don't add _ to number
> +          skip_input ();
> +          current_char = peek_input ();
> +
> +          length++;
> +
> +          continue;
> +        }
> +
> +      length++;
> +
> +      // add raw numbers
> +      existent_str += current_char;
> +      skip_input ();
> +      current_char = peek_input ();
> +    }
> +
> +  /* convert value to decimal representation; the text never carries a sign,
> +   * so an unsigned conversion loses nothing and doubles the usable range */
> +  unsigned long long dec_num
> +    = std::strtoull (existent_str.c_str (), nullptr, base);
> +
> +  existent_str = std::to_string (dec_num);
> +
> +  // parse in type suffix if it exists
> +  auto type_suffix_pair = parse_in_type_suffix ();
> +  PrimitiveCoreType type_hint = type_suffix_pair.first;
> +  length += type_suffix_pair.second;
> +
> +  current_column += length;
> +
> +  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
> +    {
> +      rust_error_at (get_current_location (),
> +                     "invalid type suffix %qs for integer (%s) literal",
> +                     get_type_hint_string (type_hint),
> +                     base == 16
> +                       ? "hex"
> +                       : (base == 8 ? "octal"
> +                                    : (base == 2 ? "binary"
> +                                                 : "<insert unknown base>")));
> +      return nullptr;
> +    }
> +  return Token::make_int (loc, std::move (existent_str), type_hint);
> +}
> +
> +/* Dispatches on the radix marker that follows the current character: `x`
> + * (hex), `o` (octal) or `b` (binary).  Returns the token produced by
> + * parse_non_decimal_int_literal, or nullptr when no such marker follows.  */
> +TokenPtr
> +Lexer::parse_non_decimal_int_literals (Location loc)
> +{
> +  std::string prefix;
> +  prefix.reserve (16); // some sensible default
> +  prefix += current_char;
> +
> +  current_char = peek_input ();
> +
> +  switch (current_char)
> +    {
> +    case 'x':
> +      // hex (integer only); the marker is kept in the text handed on
> +      return parse_non_decimal_int_literal (loc, is_x_digit, prefix + "x", 16);
> +    case 'o':
> +      // octal (integer only)
> +      return parse_non_decimal_int_literal (loc, is_octal_digit,
> +                                            std::move (prefix), 8);
> +    case 'b':
> +      // binary (integer only)
> +      return parse_non_decimal_int_literal (loc, is_bin_digit,
> +                                            std::move (prefix), 2);
> +    default:
> +      return nullptr;
> +    }
> +}
> +
> +/* Parses a decimal-based int literal or float literal whose first character is
> + * current_char, and returns the token created at LOC. Four shapes are told
> + * apart after the leading digits:
> + * - `.` followed by a float digit: float with fraction, optional exponent and
> + * optional suffix;
> + * - `.` accepted by check_valid_float_dot_end: float ending in a dot (a `0` is
> + * appended to the text, no suffix is read);
> + * - `e`/`E`: float with exponent but no dot, optional suffix;
> + * - anything else: integer with optional suffix.
> + * A non-float suffix on a float is reported and then ignored. */
> +TokenPtr
> +Lexer::parse_decimal_int_or_float (Location loc)
> +{
> + std::string str;
> + str.reserve (16); // some sensible default
> + str += current_char;
> +
> + // running length, added to current_column before each return
> + int length = 1;
> + bool first_zero = current_char == '0';
> +
> + current_char = peek_input ();
> +
> + // parse initial decimal integer (or first integer part of float) literal
> + auto initial_decimal = parse_in_decimal ();
> + str += std::get<0> (initial_decimal);
> + length += std::get<1> (initial_decimal);
> +
> + // detect float literal
> + if (current_char == '.' && is_float_digit (peek_input (1)))
> + {
> + // float with a '.', parse another decimal into it
> +
> + // add . to str
> + str += current_char;
> + skip_input ();
> + current_char = peek_input ();
> + length++;
> +
> + // parse another decimal number for float
> + auto second_decimal = parse_in_decimal ();
> + str += std::get<0> (second_decimal);
> + length += std::get<1> (second_decimal);
> +
> + // parse in exponent part if it exists
> + auto exponent_pair = parse_in_exponent_part ();
> + str += exponent_pair.first;
> + length += exponent_pair.second;
> +
> + // parse in type suffix if it exists
> + auto type_suffix_pair = parse_in_type_suffix ();
> + PrimitiveCoreType type_hint = type_suffix_pair.first;
> + length += type_suffix_pair.second;
> +
> + // only f32, f64 or no suffix are acceptable on a float
> + if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
> + && type_hint != CORETYPE_UNKNOWN)
> + {
> + rust_error_at (get_current_location (),
> + "invalid type suffix %qs for floating-point literal",
> + get_type_hint_string (type_hint));
> + // ignore invalid type suffix as everything else seems fine
> + type_hint = CORETYPE_UNKNOWN;
> + }
> +
> + current_column += length;
> +
> + str.shrink_to_fit ();
> + return Token::make_float (loc, std::move (str), type_hint);
> + }
> + else if (current_char == '.' && check_valid_float_dot_end (peek_input (1)))
> + {
> + // float that is just an integer with a terminating '.' character
> +
> + // add . to str
> + str += current_char;
> + skip_input ();
> + current_char = peek_input ();
> + length++;
> +
> + // add a '0' after the . to prevent ambiguity
> + str += '0';
> +
> + // type hint not allowed
> +
> + current_column += length;
> +
> + str.shrink_to_fit ();
> + return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN);
> + }
> + else if (current_char == 'E' || current_char == 'e')
> + {
> + // exponent float with no '.' character
> +
> + // parse exponent part
> + auto exponent_pair = parse_in_exponent_part ();
> + str += exponent_pair.first;
> + length += exponent_pair.second;
> +
> + // parse in type suffix if it exists
> + auto type_suffix_pair = parse_in_type_suffix ();
> + PrimitiveCoreType type_hint = type_suffix_pair.first;
> + length += type_suffix_pair.second;
> +
> + // same suffix rule as for floats with a fractional part
> + if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64
> + && type_hint != CORETYPE_UNKNOWN)
> + {
> + rust_error_at (get_current_location (),
> + "invalid type suffix %qs for floating-point literal",
> + get_type_hint_string (type_hint));
> + // ignore invalid type suffix as everything else seems fine
> + type_hint = CORETYPE_UNKNOWN;
> + }
> +
> + current_column += length;
> +
> + str.shrink_to_fit ();
> + return Token::make_float (loc, std::move (str), type_hint);
> + }
> + else
> + {
> + // is an integer
> +
> + // parse in type suffix if it exists
> + auto type_suffix_pair = parse_in_type_suffix ();
> + PrimitiveCoreType type_hint = type_suffix_pair.first;
> + /* A "real" pure decimal doesn't have a suffix and no zero prefix. */
> + if (type_hint == CORETYPE_UNKNOWN)
> + {
> + // a lone "0" still counts as pure: str.size () == 1 permits it
> + bool pure_decimal = std::get<2> (initial_decimal);
> + if (pure_decimal && (!first_zero || str.size () == 1))
> + type_hint = CORETYPE_PURE_DECIMAL;
> + }
> + length += type_suffix_pair.second;
> +
> + current_column += length;
> +
> + str.shrink_to_fit ();
> + return Token::make_int (loc, std::move (str), type_hint);
> + }
> +}
> +
> +/* Parses either a character literal or a lifetime name, starting at the
> + * codepoint currently at the front of the input (presumably the one right
> + * after the opening `'` - confirm against the caller). Returns a char token, a
> + * lifetime token, or nullptr when the input is at EOF or matches neither form.
> + * All tokens are created at LOC. */
> +TokenPtr
> +Lexer::parse_char_or_lifetime (Location loc)
> +{
> + Codepoint current_char32;
> +
> + int length = 1;
> +
> + current_char32 = peek_codepoint_input ();
> + if (current_char32.is_eof ())
> + return nullptr;
> +
> + // parse escaped char literal
> + if (current_char32.value == '\\')
> + {
> + // parse escape
> + auto utf8_escape_pair = parse_utf8_escape ('\'');
> + current_char32 = std::get<0> (utf8_escape_pair);
> + length += std::get<1> (utf8_escape_pair);
> +
> + // a missing closing quote is reported, but a token is still produced
> + if (peek_codepoint_input ().value != '\'')
> + {
> + rust_error_at (get_current_location (), "unended character literal");
> + }
> + else
> + {
> + skip_codepoint_input ();
> + current_char = peek_input ();
> + length++;
> + }
> +
> + current_column += length;
> +
> + return Token::make_char (loc, current_char32);
> + }
> + else
> + {
> + // consume the candidate character, then look at what follows it
> + skip_codepoint_input ();
> +
> + if (peek_codepoint_input ().value == '\'')
> + {
> + // parse non-escaped char literal
> +
> + // skip the ' character
> + skip_input ();
> + current_char = peek_input ();
> +
> + // TODO fix due to different widths of utf-8 chars?
> + current_column += 3;
> +
> + return Token::make_char (loc, current_char32);
> + }
> + else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
> + || current_char32.value == '_')
> + {
> + // parse lifetime name
> + // the name starts with the codepoint consumed above
> + std::string str;
> + str += current_char32;
> + length++;
> +
> + current_char = peek_input ();
> + while (ISDIGIT (current_char) || ISALPHA (current_char)
> + || current_char == '_')
> + {
> + str += current_char;
> + skip_input ();
> + current_char = peek_input ();
> + length++;
> + }
> +
> + current_column += length;
> +
> + str.shrink_to_fit ();
> + return Token::make_lifetime (loc, std::move (str));
> + }
> + else
> + {
> + // neither a closing quote nor a lifetime name follows
> + rust_error_at (
> + get_current_location (),
> + "expected %' after character constant in character literal");
> + return nullptr;
> + }
> + }
> +}
> +
> +// Returns the length in bytes (1-4) of the UTF-8 sequence at the current
> +// position, without consuming any input. Returns 0 when the EOF test below
> +// matches, when the first byte is a continuation byte, when an expected
> +// continuation byte is missing, or when the lead byte matches no 1-4 byte
> +// pattern (the last case also emits an error). Only the bit patterns of the
> +// bytes are inspected here.
> +int
> +Lexer::get_input_codepoint_length ()
> +{
> + uint8_t input = peek_input ();
> +
> + // NOTE(review): the int from peek_input was truncated to a byte above, so a
> + // literal 0xFF byte also satisfies this test (assuming EOF is -1) -- confirm
> + // that treating it as end of input is intended.
> + if ((int8_t) input == EOF)
> + return 0;
> +
> + if (input < 128)
> + {
> + // ascii -- 1 byte
> + // return input;
> +
> + return 1;
> + }
> + else if ((input & 0xC0) == 0x80)
> + {
> + // invalid (continuation; can't be first char)
> + // return 0xFFFE;
> +
> + return 0;
> + }
> + else if ((input & 0xE0) == 0xC0)
> + {
> + // 2 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
> + // return output;
> + return 2;
> + }
> + else if ((input & 0xF0) == 0xE0)
> + {
> + // 3 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + uint8_t input3 = peek_input (2);
> + if ((input3 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + /*uint32_t output
> + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
> + 0); return output;*/
> + return 3;
> + }
> + else if ((input & 0xF8) == 0xF0)
> + {
> + // 4 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + uint8_t input3 = peek_input (2);
> + if ((input3 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + uint8_t input4 = peek_input (3);
> + if ((input4 & 0xC0) != 0x80)
> + return 0;
> + // return 0xFFFE;
> +
> + /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
> + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
> + return output;*/
> + return 4;
> + }
> + else
> + {
> + // lead byte has its five top bits set: no 1-4 byte pattern matches
> + rust_error_at (get_current_location (),
> + "invalid UTF-8 [FIRST] (too long)");
> + return 0;
> + }
> +}
> +
> +// Decodes and returns the UTF-8 codepoint at the current position without
> +// consuming any input. Returns Codepoint::eof () when the EOF test below
> +// matches, and the value 0xFFFE for every malformed case: a continuation byte
> +// in lead position, a missing continuation byte, or a lead byte that matches
> +// no 1-4 byte pattern (the last case also emits an error).
> +Codepoint
> +Lexer::peek_codepoint_input ()
> +{
> + uint8_t input = peek_input ();
> +
> + // NOTE(review): after truncation to a byte, a literal 0xFF also satisfies
> + // this test (assuming EOF is -1) -- confirm that is intended.
> + if ((int8_t) input == EOF)
> + return Codepoint::eof ();
> +
> + if (input < 128)
> + {
> + // ascii -- 1 byte
> + return {input};
> + }
> + else if ((input & 0xC0) == 0x80)
> + {
> + // invalid (continuation; can't be first char)
> + return {0xFFFE};
> + }
> + else if ((input & 0xE0) == 0xC0)
> + {
> + // 2 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + // 5 payload bits from the lead byte, 6 from the continuation byte
> + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
> + return {output};
> + }
> + else if ((input & 0xF0) == 0xE0)
> + {
> + // 3 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + uint8_t input3 = peek_input (2);
> + if ((input3 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + // 4 payload bits from the lead byte, 6 from each continuation byte
> + uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
> + | ((input3 & 0x3F) << 0);
> + return {output};
> + }
> + else if ((input & 0xF8) == 0xF0)
> + {
> + // 4 bytes
> + uint8_t input2 = peek_input (1);
> + if ((input2 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + uint8_t input3 = peek_input (2);
> + if ((input3 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + uint8_t input4 = peek_input (3);
> + if ((input4 & 0xC0) != 0x80)
> + return {0xFFFE};
> +
> + // 3 payload bits from the lead byte, 6 from each continuation byte
> + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
> + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
> + return {output};
> + }
> + else
> + {
> + // lead byte has its five top bits set: no 1-4 byte pattern matches
> + rust_error_at (get_current_location (),
> + "invalid UTF-8 [SECND] (too long)");
> + return {0xFFFE};
> + }
> +}
> +
> +// Consumes the whole UTF-8 sequence at the current position.
> +void
> +Lexer::skip_codepoint_input ()
> +{
> + int toSkip = get_input_codepoint_length ();
> + // get_input_codepoint_length returns 0 for EOF and for malformed input, so
> + // this assertion fires in those cases.
> + gcc_assert (toSkip >= 1);
> +
> + // skip_input (n) is declared as advancing n + 1 chars, hence the - 1
> + skip_input (toSkip - 1);
> +}
> +
> +// Returns the length in bytes (1-4) of the UTF-8 sequence that starts
> +// n_start_offset bytes ahead of the current position, without consuming
> +// input. Returns 0 for a continuation byte in lead position, a missing
> +// continuation byte, or a lead byte that matches no 1-4 byte pattern (the
> +// last case also emits an error). Unlike get_input_codepoint_length there is
> +// no EOF test here.
> +int
> +Lexer::test_get_input_codepoint_n_length (int n_start_offset)
> +{
> + uint8_t input = peek_input (n_start_offset);
> +
> + if (input < 128)
> + {
> + // ascii -- 1 byte
> + // return input;
> + return 1;
> + }
> + else if ((input & 0xC0) == 0x80)
> + {
> + // invalid (continuation; can't be first char)
> + // return 0xFFFE;
> + return 0;
> + }
> + else if ((input & 0xE0) == 0xC0)
> + {
> + // 2 bytes
> + uint8_t input2 = peek_input (n_start_offset + 1);
> + if ((input2 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
> + // return output;
> + return 2;
> + }
> + else if ((input & 0xF0) == 0xE0)
> + {
> + // 3 bytes
> + uint8_t input2 = peek_input (n_start_offset + 1);
> + if ((input2 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + uint8_t input3 = peek_input (n_start_offset + 2);
> + if ((input3 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + /*uint32_t output
> + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) <<
> + 0); return output;*/
> + return 3;
> + }
> + else if ((input & 0xF8) == 0xF0)
> + {
> + // 4 bytes
> + uint8_t input2 = peek_input (n_start_offset + 1);
> + if ((input2 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + uint8_t input3 = peek_input (n_start_offset + 2);
> + if ((input3 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + uint8_t input4 = peek_input (n_start_offset + 3);
> + if ((input4 & 0xC0) != 0x80)
> + // return 0xFFFE;
> + return 0;
> +
> + /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
> + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
> + return output;*/
> + return 4;
> + }
> + else
> + {
> + // lead byte has its five top bits set: no 1-4 byte pattern matches
> + rust_error_at (get_current_location (),
> + "invalid UTF-8 [THIRD] (too long)");
> + return 0;
> + }
> +}
> +
> +// Intended to peek the codepoint n codepoints ahead of the current one - try
> +// not to use.
> +// NOT IMPLEMENTED: the function sums the byte lengths of the first n
> +// codepoints and then unconditionally fails an assertion; the decoding logic
> +// exists only as the commented-out draft at the end of the body.
> +Codepoint
> +Lexer::test_peek_codepoint_input (int n)
> +{
> + int totalOffset = 0;
> +
> + // add up all offsets into total offset? does this do what I want?
> + // (a malformed sequence contributes 0, so the offset stops advancing there)
> + for (int i = 0; i < n; i++)
> + {
> + totalOffset += test_get_input_codepoint_n_length (totalOffset);
> + }
> + // issues: this would have (at least) O(n) lookup time, not O(1) like the
> + // rest?
> +
> + // TODO: implement if still needed
> +
> + // error out of function as it is not implemented
> + gcc_assert (1 == 0);
> + return {0};
> + /*
> + uint8_t input = peek_input();
> +
> + if (input < 128) {
> + // ascii -- 1 byte
> + return input;
> + } else if ((input & 0xC0) == 0x80) {
> + // invalid (continuation; can't be first char)
> + return 0xFFFE;
> + } else if ((input & 0xE0) == 0xC0) {
> + // 2 bytes
> + uint8_t input2 = peek_input(1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
> + return output;
> + } else if ((input & 0xF0) == 0xE0) {
> + // 3 bytes
> + uint8_t input2 = peek_input(1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint8_t input3 = peek_input(2);
> + if ((input3 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint32_t output
> + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 &
> + 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) {
> + // 4 bytes
> + uint8_t input2 = peek_input(1);
> + if ((input2 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint8_t input3 = peek_input(2);
> + if ((input3 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint8_t input4 = peek_input(3);
> + if ((input4 & 0xC0) != 0x80)
> + return 0xFFFE;
> +
> + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
> + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) <<
> + 0); return output; } else { rust_error_at(get_current_location(), "invalid
> + UTF-8 (too long)"); return 0xFFFE;
> + }*/
> +}
> +
> +// Replaces the current token with two tokens: NEW_LEFT at the current
> +// token's location and NEW_RIGHT one location step after it.
> +void
> +Lexer::split_current_token (TokenId new_left, TokenId new_right)
> +{
> + /* TODO: assert that this TokenId is a "simple token" like punctuation and not
> + * like "IDENTIFIER"? */
> + Location left_locus = peek_token ()->get_locus ();
> + Location right_locus = left_locus + 1;
> +
> + // the left half takes the slot of the token being split ...
> + token_queue.replace_current_value (Token::make (new_left, left_locus));
> + // ... and the right half is queued directly behind it
> + token_queue.insert (1, Token::make (new_right, right_locus));
> +}
> +
> +// Forwards the start of a new line to the line map; does nothing when the
> +// lexer has no line map.
> +void
> +Lexer::start_line (int current_line, int current_column)
> +{
> + if (!line_map)
> + return;
> +
> + line_map->start_line (current_line, current_column);
> +}
> +
> +} // namespace Rust
> diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
> new file mode 100644
> index 00000000000..d5a6c53719f
> --- /dev/null
> +++ b/gcc/rust/lex/rust-lex.h
> @@ -0,0 +1,271 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#ifndef RUST_LEX_H
> +#define RUST_LEX_H
> +
> +#include "rust-linemap.h"
> +#include "rust-buffered-queue.h"
> +#include "rust-token.h"
> +
> +namespace Rust {
> +// Simple wrapper for FILE* that simplifies destruction: the wrapped stream is
> +// closed when the wrapper is destroyed or move-assigned over, except that
> +// stdin is never closed. The type is move-only. The filename pointer is
> +// stored as given and is not copied.
> +struct RAIIFile
> +{
> +private:
> + FILE *file;
> + const char *filename;
> +
> + // Closes the wrapped stream unless it is null or stdin.
> + void close ()
> + {
> + if (file != nullptr && file != stdin)
> + fclose (file);
> + }
> +
> +public:
> + // Opens FILENAME for reading; "-" selects stdin. A null FILENAME or a failed
> + // fopen leaves the wrapper empty, which ok () reports as false.
> + RAIIFile (const char *filename) : file (nullptr), filename (filename)
> + {
> + // strcmp on a null pointer is undefined, so bail out before touching it
> + if (filename == nullptr)
> + return;
> +
> + if (strcmp (filename, "-") == 0)
> + file = stdin;
> + else
> + file = fopen (filename, "r");
> + }
> +
> + /**
> + * Create a RAIIFile from an existing instance of FILE*
> + */
> + RAIIFile (FILE *raw, const char *filename = nullptr)
> + : file (raw), filename (filename)
> + {}
> +
> + RAIIFile (const RAIIFile &other) = delete;
> + RAIIFile &operator= (const RAIIFile &other) = delete;
> +
> + // have to specify setting file to nullptr, otherwise unintended fclose occurs
> + RAIIFile (RAIIFile &&other) : file (other.file), filename (other.filename)
> + {
> + other.file = nullptr;
> + }
> +
> + RAIIFile &operator= (RAIIFile &&other)
> + {
> + // Self-move must be a no-op: otherwise close () would shut the very stream
> + // we mean to keep, and the reset below would then drop it.
> + if (this == &other)
> + return *this;
> +
> + close ();
> + file = other.file;
> + filename = other.filename;
> + other.file = nullptr;
> +
> + return *this;
> + }
> +
> + // Returns an empty wrapper (null stream, null filename); ok () is false.
> + static RAIIFile create_error () { return RAIIFile (nullptr, nullptr); }
> +
> + ~RAIIFile () { close (); }
> +
> + // The wrapper keeps ownership of the returned stream.
> + FILE *get_raw () { return file; }
> + // May be null, e.g. for a wrapper made by create_error ().
> + const char *get_filename () { return filename; }
> +
> + // True when a stream is held.
> + bool ok () const { return file; }
> +};
> +
> +// Turns a character stream (a file or an in-memory string) into a stream of
> +// tokens. Raw characters and finished tokens are each held in a
> +// buffered_queue, which is what peek_input/skip_input and
> +// peek_token/skip_token operate on.
> +class Lexer
> +{
> +private:
> + // Request new Location for current column in line_table
> + Location get_current_location ();
> +
> + // Skips the current input char.
> + void skip_input ();
> + // Advances current input char to n + 1 chars ahead of current position.
> + void skip_input (int n);
> +
> + // Peeks the current char.
> + int peek_input ();
> + // Returns char n chars ahead of current position.
> + int peek_input (int n);
> +
> + // Classifies keyword (i.e. gets id for keyword).
> + TokenId classify_keyword (const std::string &str);
> +
> + // Builds a token from the input queue.
> + TokenPtr build_token ();
> +
> + std::tuple<std::string, int, bool> parse_in_decimal ();
> + std::pair<std::string, int> parse_in_exponent_part ();
> + std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
> + std::tuple<char, int, bool> parse_escape (char opening_char);
> + std::tuple<Codepoint, int, bool> parse_utf8_escape (char opening_char);
> + int parse_partial_string_continue ();
> + std::pair<long, int> parse_partial_hex_escape ();
> + std::pair<Codepoint, int> parse_partial_unicode_escape ();
> +
> + int get_input_codepoint_length ();
> + int test_get_input_codepoint_n_length (int n_start_offset);
> + Codepoint peek_codepoint_input ();
> + Codepoint test_peek_codepoint_input (int n);
> + void skip_codepoint_input ();
> + void skip_broken_string_input (int current_char);
> +
> + TokenPtr parse_byte_char (Location loc);
> + TokenPtr parse_byte_string (Location loc);
> + TokenPtr parse_raw_byte_string (Location loc);
> + TokenPtr parse_raw_identifier (Location loc);
> + TokenPtr parse_string (Location loc);
> + TokenPtr maybe_parse_raw_string (Location loc);
> + TokenPtr parse_raw_string (Location loc, int initial_hash_count);
> + TokenPtr parse_non_decimal_int_literals (Location loc);
> + TokenPtr parse_decimal_int_or_float (Location loc);
> + TokenPtr parse_char_or_lifetime (Location loc);
> + TokenPtr parse_identifier_or_keyword (Location loc);
> +
> + template <typename IsDigitFunc>
> + TokenPtr parse_non_decimal_int_literal (Location loc,
> + IsDigitFunc is_digit_func,
> + std::string existent_str, int base);
> +
> +public:
> + // Construct lexer with input file and filename provided
> + Lexer (const char *filename, RAIIFile input, Linemap *linemap);
> +
> + // Lex the contents of a string instead of a file
> + Lexer (const std::string &input);
> +
> + // dtor
> + ~Lexer ();
> +
> + // don't allow copy semantics (for now, at least)
> + Lexer (const Lexer &other) = delete;
> + Lexer &operator= (const Lexer &other) = delete;
> +
> + // enable move semantics
> + // NOTE(review): TokenSource stores a raw Lexer *; if token_queue keeps its
> + // TokenSource by value, a moved-to Lexer would still build tokens through
> + // the moved-from object -- confirm against buffered_queue.
> + Lexer (Lexer &&other) = default;
> + Lexer &operator= (Lexer &&other) = default;
> +
> + // Returns token n tokens ahead of current position.
> + const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
> + // Peeks the current token.
> + const_TokenPtr peek_token () { return peek_token (0); }
> +
> + // Advances current token to n + 1 tokens ahead of current position.
> + void skip_token (int n) { token_queue.skip (n); }
> + // Skips the current token.
> + void skip_token () { skip_token (0); }
> +
> + // Replaces the current token with a specified token.
> + void replace_current_token (TokenPtr replacement);
> + // FIXME: don't use anymore
> +
> + /* Splits the current token into two. Intended for use with nested generics
> + * closes (i.e. T<U<X>> where >> is wrongly lexed as one token). Note that
> + * this will only work with "simple" tokens like punctuation. */
> + void split_current_token (TokenId new_left, TokenId new_right);
> +
> + Linemap *get_line_map () { return line_map; }
> + // Returns the input's filename, or an empty string when there is none.
> + std::string get_filename ()
> + {
> + // RAIIFile may carry a null filename, and building a std::string from a
> + // null pointer is undefined behaviour.
> + const char *name = input.get_filename ();
> + return name ? std::string (name) : std::string ();
> + }
> +
> +private:
> + void start_line (int current_line, int current_column);
> +
> + // File for use as input.
> + RAIIFile input;
> + // TODO is this actually required? could just have file storage in InputSource
> +
> + // Current line number.
> + int current_line;
> + // Current column number.
> + int current_column;
> + // Current character.
> + int current_char;
> + // Line map.
> + Linemap *line_map;
> +
> + /* Max column number that can be quickly allocated - higher may require
> + * allocating new linemap */
> + static const int max_column_hint = 80;
> +
> + // Input source wrapper thing.
> + class InputSource
> + {
> + public:
> + virtual ~InputSource () {}
> +
> + // Returns the next char from the input stream, or EOF when exhausted.
> + virtual int next () = 0;
> + };
> +
> + class FileInputSource : public InputSource
> + {
> + private:
> + // Input source file.
> + FILE *input;
> +
> + public:
> + // Create new input source from file.
> + FileInputSource (FILE *input) : input (input) {}
> +
> + int next () override { return fgetc (input); }
> + };
> +
> + class BufferInputSource : public InputSource
> + {
> + private:
> + // Held by reference: the string must outlive this input source.
> + const std::string &buffer;
> + size_t offs;
> +
> + public:
> + // Create new input source reading string B, starting at byte OFFSET.
> + BufferInputSource (const std::string &b, size_t offset)
> + : buffer (b), offs (offset)
> + {}
> +
> + int next () override
> + {
> + if (offs >= buffer.size ())
> + return EOF;
> +
> + // Go through unsigned char, as fgetc does: where char is signed, a byte
> + // >= 0x80 would otherwise come out negative and 0xFF would equal EOF.
> + return (unsigned char) buffer.at (offs++);
> + }
> + };
> +
> + // The input source for the lexer.
> + // InputSource input_source;
> + // Input file queue.
> + std::unique_ptr<InputSource> raw_input_source;
> + buffered_queue<int, InputSource &> input_queue;
> +
> + // Token source wrapper thing.
> + struct TokenSource
> + {
> + // The lexer object that will use this TokenSource.
> + Lexer *lexer;
> +
> + // Create a new TokenSource with given lexer.
> + TokenSource (Lexer *parLexer) : lexer (parLexer) {}
> +
> + // Builds and returns the next token from the lexer.
> + TokenPtr next () { return lexer->build_token (); }
> + };
> +
> + // The token source for the lexer.
> + // TokenSource token_source;
> + // Token stream queue.
> + buffered_queue<std::shared_ptr<Token>, TokenSource> token_queue;
> +};
> +
> +} // namespace Rust
> +
> +#endif
> diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc
> new file mode 100644
> index 00000000000..68313c20b1c
> --- /dev/null
> +++ b/gcc/rust/lex/rust-token.cc
> @@ -0,0 +1,135 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#include "rust-token.h"
> +
> +#include "rust-diagnostics.h" // for error_at
> +
> +namespace Rust {
> +// Returns the description string of a token id, i.e. the second argument of
> +// its RS_TOKEN / RS_TOKEN_KEYWORD entry in RS_TOKEN_LIST. The switch cases are
> +// generated from that list with x-macros; an id outside the list is treated
> +// as unreachable.
> +const char *
> +get_token_description (TokenId id)
> +{
> + switch (id)
> + {
> +#define RS_TOKEN(name, descr) \
> + case name: \
> + return descr;
> +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
> + RS_TOKEN_LIST
> +#undef RS_TOKEN_KEYWORD
> +#undef RS_TOKEN
> + default:
> + gcc_unreachable ();
> + }
> +}
> +
> +/* Returns the name of the enumerator itself as a string (the stringified
> + * first argument of its RS_TOKEN_LIST entry), not its description. The switch
> + * cases are generated with x-macros; an id outside the list is treated as
> + * unreachable. */
> +const char *
> +token_id_to_str (TokenId id)
> +{
> + switch (id)
> + {
> +#define RS_TOKEN(name, _) \
> + case name: \
> + return #name;
> +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
> + RS_TOKEN_LIST
> +#undef RS_TOKEN_KEYWORD
> +#undef RS_TOKEN
> + default:
> + gcc_unreachable ();
> + }
> +}
> +
> +// Returns the printable name of a primitive core type. CORETYPE_UNKNOWN and
> +// any value without a case of its own yield "unknown".
> +const char *
> +get_type_hint_string (PrimitiveCoreType type)
> +{
> + const char *name = "unknown";
> +
> + switch (type)
> + {
> + case CORETYPE_BOOL:
> + name = "bool";
> + break;
> + case CORETYPE_CHAR:
> + name = "char";
> + break;
> + case CORETYPE_STR:
> + name = "str";
> + break;
> + // CORETYPE_INT shares its value with CORETYPE_ISIZE
> + case CORETYPE_ISIZE:
> + name = "isize";
> + break;
> + // CORETYPE_UINT shares its value with CORETYPE_USIZE
> + case CORETYPE_USIZE:
> + name = "usize";
> + break;
> + case CORETYPE_F32:
> + name = "f32";
> + break;
> + case CORETYPE_F64:
> + name = "f64";
> + break;
> + case CORETYPE_I8:
> + name = "i8";
> + break;
> + case CORETYPE_I16:
> + name = "i16";
> + break;
> + case CORETYPE_I32:
> + name = "i32";
> + break;
> + case CORETYPE_I64:
> + name = "i64";
> + break;
> + case CORETYPE_I128:
> + name = "i128";
> + break;
> + case CORETYPE_U8:
> + name = "u8";
> + break;
> + case CORETYPE_U16:
> + name = "u16";
> + break;
> + case CORETYPE_U32:
> + name = "u32";
> + break;
> + case CORETYPE_U64:
> + name = "u64";
> + break;
> + case CORETYPE_U128:
> + name = "u128";
> + break;
> + case CORETYPE_PURE_DECIMAL:
> + name = "pure_decimal";
> + break;
> + case CORETYPE_UNKNOWN:
> + default:
> + break;
> + }
> +
> + return name;
> +}
> +
> +// Returns the printable name of this token's type hint, as produced by
> +// get_type_hint_string.
> +const char *
> +Token::get_type_hint_str () const
> +{
> + return get_type_hint_string (type_hint);
> +}
> +
> +// Returns the text associated with this token. For a token that carries no
> +// text, an error is reported at the token's location and a reference to a
> +// shared empty string is returned instead.
> +const std::string &
> +Token::get_str () const
> +{
> + if (str != nullptr)
> + return *str;
> +
> + // FIXME: attempt to return null again
> + // HACK: allow referencing an empty string
> + static const std::string no_text;
> +
> + rust_error_at (get_locus (),
> + "attempted to get string for %<%s%>, which has no string. "
> + "returning empty string instead",
> + get_token_description ());
> + return no_text;
> +}
> +} // namespace Rust
> diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h
> new file mode 100644
> index 00000000000..3fa46a2cebe
> --- /dev/null
> +++ b/gcc/rust/lex/rust-token.h
> @@ -0,0 +1,455 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#ifndef RUST_TOKEN_H
> +#define RUST_TOKEN_H
> +
> +#include "rust-linemap.h"
> +#include "rust-codepoint.h"
> +
> +// order: config, system, coretypes, input
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "input.h"
> +
> +namespace Rust {
> +// "Primitive core types" in Rust - the different int and float types, as well
> +// as some others. Used as the type hint attached to tokens.
> +enum PrimitiveCoreType
> +{
> + // no type hint
> + CORETYPE_UNKNOWN,
> + // named primitives
> + CORETYPE_BOOL,
> + CORETYPE_CHAR,
> + CORETYPE_STR,
> + // okay technically int and uint are arch-dependent (pointer size)
> + CORETYPE_INT,
> + CORETYPE_UINT,
> + // numbered number primitives
> + CORETYPE_F32,
> + CORETYPE_F64,
> + CORETYPE_I8,
> + CORETYPE_I16,
> + CORETYPE_I32,
> + CORETYPE_I64,
> + CORETYPE_I128,
> + CORETYPE_U8,
> + CORETYPE_U16,
> + CORETYPE_U32,
> + CORETYPE_U64,
> + CORETYPE_U128,
> + // Pure decimals are used for tuple index.
> + // Also means there is no type hint.
> + CORETYPE_PURE_DECIMAL,
> + // arch-dependent pointer sizes: aliases sharing the values of CORETYPE_INT
> + // and CORETYPE_UINT, so a switch cannot have a case for both spellings
> + CORETYPE_ISIZE = CORETYPE_INT,
> + CORETYPE_USIZE = CORETYPE_UINT
> +};
> +
> +// RS_TOKEN(name, description)
> +// RS_TOKEN_KEYWORD(name, identifier)
> +//
> +// Keep RS_TOKEN_KEYWORD sorted
> +
> +/* note that abstract, async, become, box, do, final, macro, override, priv,
> + * try, typeof, unsized, virtual, and yield are unused */
> +#define RS_TOKEN_LIST \
> + RS_TOKEN (FIRST_TOKEN, "<first-token-marker>") \
> + RS_TOKEN (END_OF_FILE, "end of file") \
> + RS_TOKEN (EXCLAM, "!") \
> + RS_TOKEN (NOT_EQUAL, "!=") \
> + RS_TOKEN (PERCENT, "%") \
> + RS_TOKEN (PERCENT_EQ, "%=") \
> + RS_TOKEN (AMP, "&") \
> + RS_TOKEN (AMP_EQ, "&=") \
> + RS_TOKEN (LOGICAL_AND, "&&") \
> + RS_TOKEN (ASTERISK, "*") \
> + RS_TOKEN (ASTERISK_EQ, "*=") \
> + RS_TOKEN (PLUS, "+") \
> + RS_TOKEN (PLUS_EQ, "+=") \
> + RS_TOKEN (COMMA, ",") \
> + RS_TOKEN (MINUS, "-") \
> + RS_TOKEN (MINUS_EQ, "-=") \
> + RS_TOKEN (RETURN_TYPE, "->") \
> + RS_TOKEN (DOT, ".") \
> + RS_TOKEN (DOT_DOT, "..") \
> + RS_TOKEN (DOT_DOT_EQ, "..=") \
> + RS_TOKEN (ELLIPSIS, "...") \
> + RS_TOKEN (DIV, "/") \
> + RS_TOKEN (DIV_EQ, "/=") \
> + RS_TOKEN (COLON, ":") \
> + RS_TOKEN (SEMICOLON, ";") \
> + RS_TOKEN (LEFT_SHIFT, "<<") \
> + RS_TOKEN (LEFT_SHIFT_EQ, "<<=") \
> + RS_TOKEN (LEFT_ANGLE, "<") \
> + RS_TOKEN (LESS_OR_EQUAL, "<=") \
> + RS_TOKEN (EQUAL, "=") \
> + RS_TOKEN (EQUAL_EQUAL, "==") \
> + RS_TOKEN (MATCH_ARROW, "=>") \
> + RS_TOKEN (RIGHT_ANGLE, ">") \
> + RS_TOKEN (GREATER_OR_EQUAL, ">=") \
> + RS_TOKEN (RIGHT_SHIFT, ">>") \
> + RS_TOKEN (RIGHT_SHIFT_EQ, ">>=") \
> + RS_TOKEN (PATTERN_BIND, "@") \
> + RS_TOKEN (TILDE, "~") \
> + RS_TOKEN (BACKSLASH, "\\") \
> + RS_TOKEN (BACKTICK, "`") \
> + RS_TOKEN (CARET, "^") \
> + RS_TOKEN (CARET_EQ, "^=") \
> + RS_TOKEN (PIPE, "|") \
> + RS_TOKEN (PIPE_EQ, "|=") \
> + RS_TOKEN (OR, "||") \
> + RS_TOKEN (QUESTION_MARK, "?") \
> + RS_TOKEN (HASH, "#") \
> + /* from here on, dodgy and may not be correct. not operators and may be \
> + * symbols */ \
> + /* RS_TOKEN(SPACE, " ") probably too dodgy */ \
> + /* RS_TOKEN(NEWLINE, "\n")*/ \
> + RS_TOKEN (SCOPE_RESOLUTION, "::") /* dodgy */ \
> + RS_TOKEN (SINGLE_QUOTE, "'") /* should i differentiate from lifetime? */ \
> + RS_TOKEN (DOUBLE_QUOTE, "\"") \
> + RS_TOKEN (UNDERSCORE, \
> + "_") /* TODO: treat as reserved word like mrustc instead? */ \
> + RS_TOKEN (IDENTIFIER, "identifier") \
> + RS_TOKEN (INT_LITERAL, \
> + "integer literal") /* do different int and float types need \
> + different literal types? */ \
> + RS_TOKEN (FLOAT_LITERAL, "float literal") \
> + RS_TOKEN (STRING_LITERAL, "string literal") \
> + RS_TOKEN (CHAR_LITERAL, "character literal") \
> + RS_TOKEN (BYTE_STRING_LITERAL, "byte string literal") \
> + RS_TOKEN (BYTE_CHAR_LITERAL, "byte character literal") \
> + RS_TOKEN (LIFETIME, "lifetime") /* TODO: improve token type */ \
> + /* Have "interpolated" tokens (whatever that means)? identifier, path, type, \
> + * pattern, */ \
> + /* expression, statement, block, meta, item in mrustc (but not directly in \
> + * lexer). */ \
> + RS_TOKEN (LEFT_PAREN, "(") \
> + RS_TOKEN (RIGHT_PAREN, ")") \
> + RS_TOKEN (LEFT_CURLY, "{") \
> + RS_TOKEN (RIGHT_CURLY, "}") \
> + RS_TOKEN (LEFT_SQUARE, "[") \
> + RS_TOKEN (RIGHT_SQUARE, "]") \
> + /* Macros */ \
> + RS_TOKEN (DOLLAR_SIGN, "$") \
> + /* Doc Comments */ \
> + RS_TOKEN (INNER_DOC_COMMENT, "#![doc]") \
> + RS_TOKEN (OUTER_DOC_COMMENT, "#[doc]") \
> + /* have "weak" union and 'static keywords? */ \
> + \
> + RS_TOKEN_KEYWORD (ABSTRACT, "abstract") /* unused */ \
> + RS_TOKEN_KEYWORD (AS, "as") \
> + RS_TOKEN_KEYWORD (ASYNC, "async") /* unused */ \
> + RS_TOKEN_KEYWORD (BECOME, "become") /* unused */ \
> + RS_TOKEN_KEYWORD (BOX, "box") /* unused */ \
> + RS_TOKEN_KEYWORD (BREAK, "break") \
> + RS_TOKEN_KEYWORD (CONST, "const") \
> + RS_TOKEN_KEYWORD (CONTINUE, "continue") \
> + RS_TOKEN_KEYWORD (CRATE, "crate") \
> + /* FIXME: Do we need to add $crate (DOLLAR_CRATE) as a reserved kw? */ \
> + RS_TOKEN_KEYWORD (DO, "do") /* unused */ \
> + RS_TOKEN_KEYWORD (DYN, "dyn") \
> + RS_TOKEN_KEYWORD (ELSE, "else") \
> + RS_TOKEN_KEYWORD (ENUM_TOK, "enum") \
> + RS_TOKEN_KEYWORD (EXTERN_TOK, "extern") \
> + RS_TOKEN_KEYWORD (FALSE_LITERAL, "false") \
> + RS_TOKEN_KEYWORD (FINAL_TOK, "final") /* unused */ \
> + RS_TOKEN_KEYWORD (FN_TOK, "fn") \
> + RS_TOKEN_KEYWORD (FOR, "for") \
> + RS_TOKEN_KEYWORD (IF, "if") \
> + RS_TOKEN_KEYWORD (IMPL, "impl") \
> + RS_TOKEN_KEYWORD (IN, "in") \
> + RS_TOKEN_KEYWORD (LET, "let") \
> + RS_TOKEN_KEYWORD (LOOP, "loop") \
> + RS_TOKEN_KEYWORD (MACRO, "macro") /* unused */ \
> + RS_TOKEN_KEYWORD (MATCH_TOK, "match") \
> + RS_TOKEN_KEYWORD (MOD, "mod") \
> + RS_TOKEN_KEYWORD (MOVE, "move") \
> + RS_TOKEN_KEYWORD (MUT, "mut") \
> + RS_TOKEN_KEYWORD (OVERRIDE_TOK, "override") /* unused */ \
> + RS_TOKEN_KEYWORD (PRIV, "priv") /* unused */ \
> + RS_TOKEN_KEYWORD (PUB, "pub") \
> + RS_TOKEN_KEYWORD (REF, "ref") \
> + RS_TOKEN_KEYWORD (RETURN_TOK, "return") \
> + RS_TOKEN_KEYWORD (SELF_ALIAS, \
> + "Self") /* mrustc does not treat this as a reserved word*/ \
> + RS_TOKEN_KEYWORD (SELF, "self") \
> + RS_TOKEN_KEYWORD (STATIC_TOK, "static") \
> + RS_TOKEN_KEYWORD (STRUCT_TOK, "struct") \
> + RS_TOKEN_KEYWORD (SUPER, "super") \
> + RS_TOKEN_KEYWORD (TRAIT, "trait") \
> + RS_TOKEN_KEYWORD (TRUE_LITERAL, "true") \
> + RS_TOKEN_KEYWORD (TRY, "try") /* unused */ \
> + RS_TOKEN_KEYWORD (TYPE, "type") \
> + RS_TOKEN_KEYWORD (TYPEOF, "typeof") /* unused */ \
> + RS_TOKEN_KEYWORD (UNSAFE, "unsafe") \
> + RS_TOKEN_KEYWORD (UNSIZED, "unsized") /* unused */ \
> + RS_TOKEN_KEYWORD (USE, "use") \
> + RS_TOKEN_KEYWORD (VIRTUAL, "virtual") /* unused */ \
> + RS_TOKEN_KEYWORD (WHERE, "where") \
> + RS_TOKEN_KEYWORD (WHILE, "while") \
> + RS_TOKEN_KEYWORD (YIELD, "yield") /* unused */ \
> + \
> + RS_TOKEN (LAST_TOKEN, "<last-token-marker>")
> +
> +// Contains all token types: one enumerator per RS_TOKEN / RS_TOKEN_KEYWORD
> +// entry of RS_TOKEN_LIST, in list order (so FIRST_TOKEN comes first and
> +// LAST_TOKEN last). Generated via x-macros.
> +enum TokenId
> +{
> +#define RS_TOKEN(name, _) name,
> +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
> + RS_TOKEN_LIST
> +#undef RS_TOKEN_KEYWORD
> +#undef RS_TOKEN
> +};
> +
> +// dodgy "TokenPtr" declaration with Token forward declaration
> +class Token;
> +// A smart pointer (shared_ptr) to Token.
> +typedef std::shared_ptr<Token> TokenPtr;
> +// A smart pointer (shared_ptr) to a constant Token.
> +typedef std::shared_ptr<const Token> const_TokenPtr;
> +
> +// Hackily defined way to get token description for enum value using x-macros
> +const char *
> +get_token_description (TokenId id);
> +/* Hackily defined way to get the name of the enumerator itself (not its
> + * description) as a string, using x-macros */
> +const char *
> +token_id_to_str (TokenId id);
> +// Get type hint description as a string.
> +const char *
> +get_type_hint_string (PrimitiveCoreType type);
> +
> +// Represents a single token. Create using factory static methods.
> +class Token
> +{
> +private:
> + // Token kind.
> + TokenId token_id;
> + // Token location.
> + Location locus;
> + // Associated text (if any) of token.
> + std::unique_ptr<std::string> str;
> + // TODO: maybe remove issues and just store std::string as value?
> + /* Type hint for token based on lexer data (e.g. type suffix). Does not exist
> + * for most tokens. */
> + PrimitiveCoreType type_hint;
> +
> + // Token constructor from token id and location. Has a null string.
> + Token (TokenId token_id, Location location)
> + : token_id (token_id), locus (location), str (nullptr),
> + type_hint (CORETYPE_UNKNOWN)
> + {}
> +
> + // Token constructor from token id, location, and a string.
> + Token (TokenId token_id, Location location, std::string &¶mStr)
> + : token_id (token_id), locus (location),
> + str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
> + {}
> +
> + // Token constructor from token id, location, and a char.
> + Token (TokenId token_id, Location location, char paramChar)
> + : token_id (token_id), locus (location),
> + str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
> + {}
> +
> + // Token constructor from token id, location, and a "codepoint".
> + Token (TokenId token_id, Location location, Codepoint paramCodepoint)
> + : token_id (token_id), locus (location),
> + str (new std::string (paramCodepoint.as_string ())),
> + type_hint (CORETYPE_UNKNOWN)
> + {}
> +
> + // Token constructor from token id, location, a string, and type hint.
> + Token (TokenId token_id, Location location, std::string &¶mStr,
> + PrimitiveCoreType parType)
> + : token_id (token_id), locus (location),
> + str (new std::string (std::move (paramStr))), type_hint (parType)
> + {}
> +
> +public:
> + // No default constructor.
> + Token () = delete;
> + // Do not copy/assign tokens.
> + Token (const Token &) = delete;
> + Token &operator= (const Token &) = delete;
> +
> + // Allow moving tokens.
> + Token (Token &&other) = default;
> + Token &operator= (Token &&other) = default;
> +
> + ~Token () = default;
> +
> + /* TODO: make_shared (which saves a heap allocation) does not work with the
> + * private constructor */
> +
> + // Makes and returns a new TokenPtr (with null string).
> + static TokenPtr make (TokenId token_id, Location locus)
> + {
> + // return std::make_shared<Token> (token_id, locus);
> + return TokenPtr (new Token (token_id, locus));
> + }
> +
> + // Makes and returns a new TokenPtr of type IDENTIFIER.
> + static TokenPtr make_identifier (Location locus, std::string &&str)
> + {
> + // return std::make_shared<Token> (IDENTIFIER, locus, str);
> + return TokenPtr (new Token (IDENTIFIER, locus, std::move (str)));
> + }
> +
> + // Makes and returns a new TokenPtr of type INT_LITERAL.
> + static TokenPtr make_int (Location locus, std::string &&str,
> + PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
> + {
> + // return std::make_shared<Token> (INT_LITERAL, locus, str, type_hint);
> + return TokenPtr (
> + new Token (INT_LITERAL, locus, std::move (str), type_hint));
> + }
> +
> + // Makes and returns a new TokenPtr of type FLOAT_LITERAL.
> + static TokenPtr make_float (Location locus, std::string &&str,
> + PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
> + {
> + // return std::make_shared<Token> (FLOAT_LITERAL, locus, str, type_hint);
> + return TokenPtr (
> + new Token (FLOAT_LITERAL, locus, std::move (str), type_hint));
> + }
> +
> + // Makes and returns a new TokenPtr of type STRING_LITERAL.
> + static TokenPtr make_string (Location locus, std::string &&str)
> + {
> + // return std::make_shared<Token> (STRING_LITERAL, locus, str,
> + // CORETYPE_STR);
> + return TokenPtr (
> + new Token (STRING_LITERAL, locus, std::move (str), CORETYPE_STR));
> + }
> +
> + // Makes and returns a new TokenPtr of type CHAR_LITERAL.
> + static TokenPtr make_char (Location locus, Codepoint char_lit)
> + {
> + // return std::make_shared<Token> (CHAR_LITERAL, locus, char_lit);
> + return TokenPtr (new Token (CHAR_LITERAL, locus, char_lit));
> + }
> +
> + // Makes and returns a new TokenPtr of type BYTE_CHAR_LITERAL.
> + static TokenPtr make_byte_char (Location locus, char byte_char)
> + {
> + // return std::make_shared<Token> (BYTE_CHAR_LITERAL, locus, byte_char);
> + return TokenPtr (new Token (BYTE_CHAR_LITERAL, locus, byte_char));
> + }
> +
> + // Makes and returns a new TokenPtr of type BYTE_STRING_LITERAL (fix).
> + static TokenPtr make_byte_string (Location locus, std::string &&str)
> + {
> + // return std::make_shared<Token> (BYTE_STRING_LITERAL, locus, str);
> + return TokenPtr (new Token (BYTE_STRING_LITERAL, locus, std::move (str)));
> + }
> +
> + // Makes and returns a new TokenPtr of type INNER_DOC_COMMENT.
> + static TokenPtr make_inner_doc_comment (Location locus, std::string &&str)
> + {
> + return TokenPtr (new Token (INNER_DOC_COMMENT, locus, std::move (str)));
> + }
> +
> + // Makes and returns a new TokenPtr of type OUTER_DOC_COMMENT.
> + static TokenPtr make_outer_doc_comment (Location locus, std::string &&str)
> + {
> + return TokenPtr (new Token (OUTER_DOC_COMMENT, locus, std::move (str)));
> + }
> +
> + // Makes and returns a new TokenPtr of type LIFETIME.
> + static TokenPtr make_lifetime (Location locus, std::string &&str)
> + {
> + // return std::make_shared<Token> (LIFETIME, locus, str);
> + return TokenPtr (new Token (LIFETIME, locus, std::move (str)));
> + }
> +
> + // Gets id of the token.
> + TokenId get_id () const { return token_id; }
> +
> + // Gets location of the token.
> + Location get_locus () const { return locus; }
> +
> + // Gets string description of the token.
> + const std::string &
> + get_str () const; /*{
> +// FIXME: put in header again when fix null problem
> +//gcc_assert(str != nullptr);
> +if (str == nullptr) {
> +error_at(get_locus(), "attempted to get string for '%s', which has no string.
> +returning empty string instead.", get_token_description()); return "";
> +}
> +return *str;
> +}*/
> +
> + /* Gets token's type hint info. The CORETYPE_PURE_DECIMAL marker is reported
> + * as CORETYPE_UNKNOWN here; use is_pure_decimal () to test for it. */
> + PrimitiveCoreType get_type_hint () const
> + {
> + return type_hint == CORETYPE_PURE_DECIMAL ? CORETYPE_UNKNOWN : type_hint;
> + }
> +
> + // diagnostics (error reporting)
> + const char *get_token_description () const
> + {
> + return Rust::get_token_description (token_id);
> + }
> +
> + // debugging
> + const char *token_id_to_str () const
> + {
> + return Rust::token_id_to_str (token_id);
> + }
> +
> + // debugging
> + const char *get_type_hint_str () const;
> +
> + /* Returns whether the token is a literal of any type (int, float, char,
> + * string, byte char, byte string). */
> + bool is_literal () const
> + {
> + switch (token_id)
> + {
> + case INT_LITERAL:
> + case FLOAT_LITERAL:
> + case CHAR_LITERAL:
> + case STRING_LITERAL:
> + case BYTE_CHAR_LITERAL:
> + case BYTE_STRING_LITERAL:
> + return true;
> + default:
> + return false;
> + }
> + }
> +
> + /* Returns whether the token actually has a string (regardless of whether it
> + * should or not). */
> + bool has_str () const { return str != nullptr; }
> +
> + // Returns whether the token should have a string.
> + bool should_have_str () const
> + {
> + return is_literal () || token_id == IDENTIFIER || token_id == LIFETIME;
> + }
> +
> + // Returns whether the token is a pure decimal int literal
> + bool is_pure_decimal () const { return type_hint == CORETYPE_PURE_DECIMAL; }
> +};
> +} // namespace Rust
> +
> +#endif
> diff --git a/gcc/rust/rust-buffered-queue.h b/gcc/rust/rust-buffered-queue.h
> new file mode 100644
> index 00000000000..afcc4670cac
> --- /dev/null
> +++ b/gcc/rust/rust-buffered-queue.h
> @@ -0,0 +1,204 @@
> +// Copyright (C) 2020-2022 Free Software Foundation, Inc.
> +
> +// This file is part of GCC.
> +
> +// GCC is free software; you can redistribute it and/or modify it under
> +// the terms of the GNU General Public License as published by the Free
> +// Software Foundation; either version 3, or (at your option) any later
> +// version.
> +
> +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> +// for more details.
> +
> +// You should have received a copy of the GNU General Public License
> +// along with GCC; see the file COPYING3. If not see
> +// <http://www.gnu.org/licenses/>.
> +
> +#ifndef RUST_BUFFERED_QUEUE_H
> +#define RUST_BUFFERED_QUEUE_H
> +
> +#include "rust-system.h"
> +
> +namespace Rust {
> + /* Buffered queue implementation. Items are of type T, queue source is of type
> + * Source. Note that the queue owns the source. */
> +template <typename T, typename Source> class buffered_queue
> +{
> +public:
> + // Construct empty queue from Source src.
> + buffered_queue (Source src) : source (src), start (0), end (0), buffer () {}
> +
> + /* disable copying (since source is probably non-copyable)
> + * TODO is this actually a good idea? If source is non-copyable, it would
> + * just delete the copy constructor anyway.*/
> + buffered_queue (const buffered_queue &other) = delete;
> + buffered_queue &operator= (const buffered_queue &other) = delete;
> +
> + // enable moving
> + buffered_queue (buffered_queue &&other) = default;
> + buffered_queue &operator= (buffered_queue &&other) = default;
> +
> + /* Returns (by value) the item n places ahead of the front of the queue, i.e.
> + * buffer[start + n]; peek (0) is the current item. Items that are not yet
> + * buffered are first read from the source with source.next (), growing the
> + * buffer if needed. */
> + T peek (int n)
> + {
> + // n counts forwards from the front of the queue, so it cannot be negative
> + rust_assert (n >= 0);
> +
> + int num_queued_items = end - start;
> + int num_items_required = n + 1;
> +
> + // if required items go past end of queue, add them to queue
> + if (num_items_required > num_queued_items)
> + {
> + int num_items_to_read = num_items_required - num_queued_items;
> +
> + /* if queue length + extra items is larger than buffer size, expand
> + * buffer */
> + if (end + num_items_to_read > (int) buffer.size ())
> + {
> + // New size is 1.5x (current buffer size + items still to read)
> + int new_size = (buffer.size () + num_items_to_read);
> + new_size += (new_size >> 1);
> +
> + // old method:
> + /*
> + // create new queue buffer with new size
> + std::vector<T> new_queue (new_size);
> + std::copy (buffer.begin () + start, buffer.begin () + end,
> + new_queue.begin ());
> + start = 0;
> + end = num_queued_items;
> + // TODO: would move be better here? optimisation for move with
> + // shared pointer?
> +
> + // swap member buffer and new queue buffer
> + std::swap (buffer, new_queue);
> + */
> +
> + // TODO: determine overhead of this approach vs copy. Should be
> + // lower.
> + std::vector<T> new_queue;
> + new_queue.reserve (new_size);
> + new_queue.insert (new_queue.begin (),
> + std::make_move_iterator (buffer.begin () + start),
> + std::make_move_iterator (buffer.begin () + end));
> + start = 0;
> + end = num_queued_items;
> + // pad up to new_size with default-constructed T so indexing can work
> + new_queue.insert (new_queue.begin () + end,
> + new_size - new_queue.size (), T ());
> +
> + buffer = std::move (new_queue);
> + /* this should be best method - std::move(range) would have
> + * allocation problems; initial construction would require
> + * reallocation upon resizing */
> +
> + // validate that buffer is large enough now
> + rust_assert (end + num_items_to_read <= (int) buffer.size ());
> + }
> +
> + /* call next () on source once per missing item and store the results
> + * in the slots past the original end */
> + for (int i = 0; i < num_items_to_read; i++)
> + buffer[end + i] = source.next ();
> +
> + // move end based on additional items added
> + end += num_items_to_read;
> + }
> +
> + rust_assert (0 <= start);
> + rust_assert (start <= end);
> + rust_assert (end <= (int) buffer.size ());
> +
> + rust_assert (start + n < end);
> +
> + // return value at start + n in buffer
> + return buffer[start + n];
> + }
> +
> + /* TODO: add faster peek current token to remove overhead of conditional
> + * branches? */
> +
> + /* Discards the first n + 1 items of the queue (so skip (0) drops just the
> + * current item), reading them from the source first if they are not yet
> + * buffered. */
> + void skip (int n)
> + {
> + // Call peek to ensure requested n is actually in queue.
> + peek (n);
> +
> + // Reset the discarded slots, start to start + n (inclusive), to T ().
> + for (int i = 0; i < (n + 1); i++)
> + buffer[start + i] = T ();
> +
> + // Move start forward by n + 1.
> + start += (n + 1);
> +
> + // Ensure start is not impossible somehow
> + rust_assert (0 <= start);
> + rust_assert (start <= end);
> +
> + // Queue is empty: rewind both indices so the buffer is reused from slot 0
> + if (start == end)
> + start = end = 0;
> + }
> +
> + /* Inserts ELEM_TO_INSERT at the front of the queue, i.e. at position START
> + * of the buffer, so that it is the next item returned by peek (0). The
> + * vector insertion shifts every later element along by one, which is linear
> + * in the buffer size - only use when really needed. */
> + void insert_at_front (T elem_to_insert)
> + {
> + /* Insert at the logical front of the queue (buffer[start]) rather than at
> + * buffer.begin (): slots before START hold already-consumed items, so
> + * inserting there would leave the new element outside [start, end) whenever
> + * START is non-zero. */
> + buffer.insert (buffer.begin () + start, std::move (elem_to_insert));
> +
> + /* Increase the end number since added element means all others have shifted
> + * one along */
> + end++;
> + }
> +
> + /* Inserts ELEM_TO_INSERT so that it becomes the item INDEX positions ahead
> + * of the current front of the queue (position START + INDEX of the buffer).
> + * Items at or after that position move one place further back. */
> + void insert (int index, T elem_to_insert)
> + {
> + // TODO: test as this may not work properly
> +
> + // index is relative to the front of the queue, so it cannot be negative
> + rust_assert (index >= 0);
> +
> + /* call peek so that every item before the insertion point is already in
> + * the buffer; note that index 0 still peeks (and so buffers) one item */
> + if (index >= 1)
> + peek (index - 1);
> + else
> + peek (index);
> +
> + buffer.insert (buffer.begin () + start + index, std::move (elem_to_insert));
> +
> + end++; // the queue now holds one more item
> + }
> +
> + /* Replaces the item at the front of the queue (the one peek (0) returns)
> + * with REPLACEMENT. Total HACK. */
> + void replace_current_value (T replacement)
> + {
> + // call peek so the front item is buffered (read from source if need be)
> + peek (0);
> +
> + buffer[start] = std::move (replacement);
> +
> + // the queue length is unchanged, so start and end stay where they are
> + }
> +
> +private:
> + // Source of tokens for queue.
> + Source source;
> +
> + // Begin of range in buffer, inclusive.
> + int start;
> + // End of range in buffer, exclusive.
> + int end;
> +
> + // Queue buffer.
> + std::vector<T> buffer;
> +};
> +} // namespace Rust
> +
> +#endif
> --
> 2.25.1
>
On Wed, Sep 14, 2022 at 03:30:39PM +0200, Richard Biener via Gcc-patches wrote:
> > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or
> > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
> > +// for more details.
> > +
> > +// You should have received a copy of the GNU General Public License
> > +// along with GCC; see the file COPYING3. If not see
> > +// <http://www.gnu.org/licenses/>.
> > +
> > +#include "rust-lex.h"
> > +
> > +#include "rust-system.h" // for rust_assert and rust_unreachable
> > +#include "rust-diagnostics.h" // for rust_error_at
> > +#include "rust-linemap.h"
> > +#include "rust-session-manager.h"
> > +#include "safe-ctype.h"
>
> just diving into a random patch here - I'm assuming I can take rust-lex.cc as
> a boiler-plate example for the #include structure.
>
> In GCC all files should start with #including "config.h" followed by
> "system.h" where _all_ system, including C++ standard library headers
> should be pulled via system.h to allow working around OS and system
> compiler issues.
>
> It might be that rust-system.h plays the role of config.h + system.h
> but then the rust-lex.h include is before it.
>
> rust-codepoint.h including <string> is also problematic btw.
E.g. the Go FE has two parts, one very GCC specific that uses the explicit
config.h + system.h etc. includes, the other is generic and there it
includes go-system.h in every file first, where that starts with
#include <config.h>
various C++ standard includes
#include <system.h>
etc.
Jakub
new file mode 100644
@@ -0,0 +1,46 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_CODEPOINT_H
+#define RUST_CODEPOINT_H
+
+#include <string>
+
+namespace Rust {
+struct Codepoint // a single 32-bit character value, or the eof () sentinel
+{
+ uint32_t value;
+
+ // Creates a zero codepoint.
+ Codepoint () : value (0) {}
+
+ /* Creates a codepoint holding VALUE (implicit conversion from uint32_t).
+ * NOTE(review): operator+= in rust-lex.cc UTF-8 encodes VALUE, i.e. treats
+ * it as a Unicode code point rather than as already-encoded UTF-8 - confirm
+ * what callers pass. */
+ Codepoint (uint32_t value) : value (value) {}
+
+ static Codepoint eof () { return Codepoint (UINT32_MAX); } // EOF sentinel
+ bool is_eof () const { return value == UINT32_MAX; }
+
+ /* Returns a C++ string containing the UTF-8 encoding of the codepoint
+ * (empty if the value is too large to encode, see operator+=). */
+ std::string as_string ();
+
+ bool operator== (Codepoint other) const { return value == other.value; }
+ bool operator!= (Codepoint other) const { return !operator== (other); }
+};
+} // namespace Rust
+
+#endif
new file mode 100644
@@ -0,0 +1,2729 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#include "rust-lex.h"
+
+#include "rust-system.h" // for rust_assert and rust_unreachable
+#include "rust-diagnostics.h" // for rust_error_at
+#include "rust-linemap.h"
+#include "rust-session-manager.h"
+#include "safe-ctype.h"
+
+namespace Rust {
+// TODO: move to separate compilation unit?
+/* Appends the UTF-8 encoding of CHAR32 to STR and returns STR. Values below
+ * 0x80 take one byte, below 0x800 two, below 0x10000 three and below 0x200000
+ * four; anything larger is reported with rust_debug and appends nothing.
+ * Surrogate values are not rejected. */
+std::string &
+operator+= (std::string &str, Codepoint char32)
+{
+ if (char32.value < 0x80) // 1 byte: 0xxxxxxx
+ {
+ str += static_cast<char> (char32.value);
+ }
+ else if (char32.value < (0x1F + 1) << (1 * 6)) // < 0x800: 2 bytes
+ {
+ str += static_cast<char> (0xC0 | ((char32.value >> 6) & 0x1F));
+ str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
+ }
+ else if (char32.value < (0x0F + 1) << (2 * 6)) // < 0x10000: 3 bytes
+ {
+ str += static_cast<char> (0xE0 | ((char32.value >> 12) & 0x0F));
+ str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
+ str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
+ }
+ else if (char32.value < (0x07 + 1) << (3 * 6)) // < 0x200000: 4 bytes
+ {
+ str += static_cast<char> (0xF0 | ((char32.value >> 18) & 0x07));
+ str += static_cast<char> (0x80 | ((char32.value >> 12) & 0x3F));
+ str += static_cast<char> (0x80 | ((char32.value >> 6) & 0x3F));
+ str += static_cast<char> (0x80 | ((char32.value >> 0) & 0x3F));
+ }
+ else
+ {
+ rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value);
+ }
+ return str;
+}
+
+std::string
+Codepoint::as_string ()
+{
+ std::string str;
+
+ // UTF-8 encode this codepoint via the operator+= overload defined above
+ str += *this;
+
+ return str;
+}
+
+/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead
+ * for handling. */
+bool
+is_float_digit (char number)
+{
+ return ISDIGIT (number) || number == 'E' || number == 'e';
+}
+
+/* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or
+ * whatever is different */
+bool
+is_x_digit (char number)
+{
+ return ISXDIGIT (number);
+}
+
+bool
+is_octal_digit (char number)
+{
+ return number >= '0' && number <= '7';
+}
+
+bool
+is_bin_digit (char number)
+{
+ return number == '0' || number == '1';
+}
+
+bool
+check_valid_float_dot_end (char character)
+{
+ return character != '.' && character != '_' && !ISALPHA (character);
+}
+
+// ISSPACE from safe-ctype but may change in future
+bool
+is_whitespace (char character)
+{
+ return ISSPACE (character);
+}
+
+bool
+is_non_decimal_int_literal_separator (char character)
+{
+ return character == 'x' || character == 'o' || character == 'b';
+}
+
+Lexer::Lexer (const std::string &input)
+ : input (RAIIFile::create_error ()), current_line (1), current_column (1),
+ line_map (nullptr), raw_input_source (new BufferInputSource (input, 0)),
+ input_queue{*raw_input_source}, token_queue (TokenSource (this))
+{}
+
+Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap)
+ : input (std::move (file_input)), current_line (1), current_column (1),
+ line_map (linemap),
+ raw_input_source (new FileInputSource (input.get_raw ())),
+ input_queue{*raw_input_source}, token_queue (TokenSource (this))
+{
+ // inform line_table that file is being entered and is in line 1
+ if (linemap)
+ line_map->start_file (filename, current_line);
+}
+
+Lexer::~Lexer ()
+{
+ /* ok apparently stop (which is equivalent of original code in destructor) is
+ * meant to be called after all files have finished parsing, for cleanup. On
+ * the other hand, actual code that it calls to leave a certain line map is
+ * mentioned in GCC docs as being useful for "just leaving an included header"
+ * and stuff like that, so this line mapping functionality may need fixing.
+ * FIXME: find out whether this occurs. */
+
+ // line_map->stop();
+}
+
+/* TODO: need to optimise somehow to avoid the virtual function call in the
+ * tight loop. Best idea at the moment is CRTP, but that might make lexer
+ * implementation annoying when storing the "base class" (i.e. would need
+ * template parameter everywhere), although in practice it would mostly just
+ * look ugly and make enclosing classes like Parser also require a type
+ * parameter. At this point a macro might be better. OK I guess macros can be
+ * replaced by constexpr if or something if possible. */
+Location
+Lexer::get_current_location ()
+{
+ if (line_map)
+ return line_map->get_location (current_column);
+ else
+ // If we have no linemap, we're lexing something without proper locations
+ return Location ();
+}
+
+int
+Lexer::peek_input (int n)
+{
+ return input_queue.peek (n);
+}
+
+int
+Lexer::peek_input ()
+{
+ return peek_input (0);
+}
+
+void
+Lexer::skip_input (int n)
+{
+ input_queue.skip (n);
+}
+
+void
+Lexer::skip_input ()
+{
+ skip_input (0);
+}
+
+void
+Lexer::replace_current_token (TokenPtr replacement)
+{
+ token_queue.replace_current_value (replacement);
+
+ rust_debug ("called 'replace_current_token' - this is deprecated");
+}
+
+/* Anonymous namespace, so these tables are only visible inside this
+ * compilation unit. They are generated from RS_TOKEN_LIST with x-macros:
+ * keyword_index holds the keyword spellings and keyword_keys the TokenId at
+ * the same index. classify_keyword binary searches keyword_index, so the
+ * keyword entries of RS_TOKEN_LIST must be sorted by spelling. */
+namespace {
+// TODO: make constexpr when update to c++20
+const std::string keyword_index[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) keyword,
+ RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+constexpr TokenId keyword_keys[] = {
+#define RS_TOKEN(x, y)
+#define RS_TOKEN_KEYWORD(name, keyword) name,
+ RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index);
+} // namespace
+
+/* Determines whether the string passed in is a keyword or not. If it is, it
+ * returns the keyword's TokenId, otherwise IDENTIFIER. `try` is also
+ * classified as IDENTIFIER when the session's edition is 2015. */
+TokenId
+Lexer::classify_keyword (const std::string &str)
+{
+ const std::string *last = keyword_index + num_keywords;
+ const std::string *idx = std::lower_bound (keyword_index, last, str);
+
+ if (idx == last || str != *idx)
+ return IDENTIFIER;
+
+ // TODO: possibly replace this x-macro system with something like hash map?
+
+ // We now have the expected token ID of the reserved keyword. However, some
+ // keywords are reserved starting in certain editions. For example, `try` is
+ // only a reserved keyword in editions >=2018. The language might gain new
+ // reserved keywords in the future.
+ //
+ // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords
+ auto id = keyword_keys[idx - keyword_index];
+
+ // `try` is not a reserved keyword before 2018
+ if (Session::get_instance ().options.get_edition ()
+ == CompileOptions::Edition::E2015
+ && id == TRY)
+ return IDENTIFIER;
+
+ return id;
+}
+
+TokenPtr
+Lexer::build_token ()
+{
+ // loop to go through multiple characters to build a single token
+ while (true)
+ {
+ Location loc = get_current_location ();
+ current_char = peek_input ();
+ skip_input ();
+
+ // detect UTF8 bom
+ //
+ // Must be the first thing on the first line.
+ // There might be an optional BOM (Byte Order Mark), which for UTF-8 is
+ // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
+ if (current_line == 1 && current_column == 1 && current_char == 0xef
+ && peek_input () == 0xbb && peek_input (1) == 0xbf)
+ {
+ skip_input (1);
+ current_char = peek_input ();
+ skip_input ();
+ }
+
+ // detect shebang
+ // Must be the first thing on the first line, starting with #!
+ // But since an attribute can also start with an #! we don't count it as a
+ // shebang line when after any whitespace or comments there is a [. If it
+ // is a shebang line we simply drop the line. Otherwise we don't consume
+ // any characters and fall through to the real tokenizer.
+ if (current_line == 1 && current_column == 1 && current_char == '#'
+ && peek_input () == '!')
+ {
+ int n = 1;
+ while (true)
+ {
+ int next_char = peek_input (n);
+ if (is_whitespace (next_char))
+ n++;
+ else if ((next_char == '/' && peek_input (n + 1) == '/'
+ && peek_input (n + 2) != '!'
+ && peek_input (n + 2) != '/')
+ || (next_char == '/' && peek_input (n + 1) == '/'
+ && peek_input (n + 2) == '/'
+ && peek_input (n + 3) == '/'))
+ {
+ // two // or four ////
+ // A single line comment
+ // (but not an inner or outer doc comment)
+ n += 2;
+ next_char = peek_input (n);
+ while (next_char != '\n' && next_char != EOF)
+ {
+ n++;
+ next_char = peek_input (n);
+ }
+ if (next_char == '\n')
+ n++;
+ }
+ else if (next_char == '/' && peek_input (n + 1) == '*'
+ && peek_input (n + 2) == '*'
+ && peek_input (n + 3) == '/')
+ {
+ /**/
+ n += 4;
+ }
+ else if (next_char == '/' && peek_input (n + 1) == '*'
+ && peek_input (n + 2) == '*' && peek_input (n + 3) == '*'
+ && peek_input (n + 4) == '/')
+ {
+ /***/
+ n += 5;
+ }
+ else if ((next_char == '/' && peek_input (n + 1) == '*'
+ && peek_input (n + 2) != '*'
+ && peek_input (n + 2) != '!')
+ || (next_char == '/' && peek_input (n + 1) == '*'
+ && peek_input (n + 2) == '*'
+ && peek_input (n + 3) == '*'))
+ {
+ // one /* or three /***
+ // Start of a block comment
+ // (but not an inner or outer doc comment)
+ n += 2;
+ int level = 1;
+ while (level > 0)
+ {
+ if (peek_input (n) == EOF)
+ break;
+ else if (peek_input (n) == '/'
+ && peek_input (n + 1) == '*')
+ {
+ n += 2;
+ level += 1;
+ }
+ else if (peek_input (n) == '*'
+ && peek_input (n + 1) == '/')
+ {
+ n += 2;
+ level -= 1;
+ }
+ else
+ n++;
+ }
+ }
+ else if (next_char != '[')
+ {
+ // definitely shebang, ignore the first line
+ while (current_char != '\n' && current_char != EOF)
+ {
+ current_char = peek_input ();
+ skip_input ();
+ }
+
+ // newline
+ current_line++;
+ current_column = 1;
+ // tell line_table that new line starts
+ start_line (current_line, max_column_hint);
+ break;
+ }
+ else
+ break; /* Definitely not a shebang line. */
+ }
+ }
+
+ // return end of file token if end of file
+ if (current_char == EOF)
+ return Token::make (END_OF_FILE, loc);
+
+ // if not end of file, start tokenising
+ switch (current_char)
+ {
+ /* ignore whitespace characters for tokens but continue updating
+ * location */
+ case '\n': // newline
+ current_line++;
+ current_column = 1;
+ // tell line_table that new line starts
+ start_line (current_line, max_column_hint);
+ continue;
+ case '\r': // cr
+ // Ignore, we expect a newline (lf) soon.
+ continue;
+ case ' ': // space
+ current_column++;
+ continue;
+ case '\t': // tab
+ // width of a tab is not well-defined, assume 8 spaces
+ current_column += 8;
+ continue;
+
+ // punctuation - actual tokens
+ case '=':
+ if (peek_input () == '>')
+ {
+ // match arm arrow
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (MATCH_ARROW, loc);
+ }
+ else if (peek_input () == '=')
+ {
+ // equality operator
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (EQUAL_EQUAL, loc);
+ }
+ else
+ {
+ // assignment operator
+ current_column++;
+ return Token::make (EQUAL, loc);
+ }
+ case '(':
+ current_column++;
+ return Token::make (LEFT_PAREN, loc);
+ case '-':
+ if (peek_input () == '>')
+ {
+ // return type specifier
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (RETURN_TYPE, loc);
+ }
+ else if (peek_input () == '=')
+ {
+ // minus-assign
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (MINUS_EQ, loc);
+ }
+ else
+ {
+ // minus
+ current_column++;
+ return Token::make (MINUS, loc);
+ }
+ case '+':
+ if (peek_input () == '=')
+ {
+ // add-assign
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (PLUS_EQ, loc);
+ }
+ else
+ {
+ // add
+ current_column++;
+ return Token::make (PLUS, loc);
+ }
+ case ')':
+ current_column++;
+ return Token::make (RIGHT_PAREN, loc);
+ case ';':
+ current_column++;
+ return Token::make (SEMICOLON, loc);
+ case '*':
+ if (peek_input () == '=')
+ {
+ // multiplication-assign
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (ASTERISK_EQ, loc);
+ }
+ else
+ {
+ // multiplication
+ current_column++;
+ return Token::make (ASTERISK, loc);
+ }
+ case ',':
+ current_column++;
+ return Token::make (COMMA, loc);
+ case '/':
+ if (peek_input () == '=')
+ {
+ // division-assign
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (DIV_EQ, loc);
+ }
+ else if ((peek_input () == '/' && peek_input (1) != '!'
+ && peek_input (1) != '/')
+ || (peek_input () == '/' && peek_input (1) == '/'
+ && peek_input (2) == '/'))
+ {
+ // two // or four ////
+ // single line comment
+ // (but not an inner or outer doc comment)
+ skip_input ();
+ current_column += 2;
+ current_char = peek_input ();
+
+ // basically ignore until line finishes
+ while (current_char != '\n' && current_char != EOF)
+ {
+ skip_input ();
+ current_column++; // not used
+ current_char = peek_input ();
+ }
+ continue;
+ }
+ else if (peek_input () == '/'
+ && (peek_input (1) == '!' || peek_input (1) == '/'))
+ {
+ /* single line doc comment, inner or outer. */
+ bool is_inner = peek_input (1) == '!';
+ skip_input (1);
+ current_column += 3;
+
+ std::string str;
+ str.reserve (32);
+ current_char = peek_input ();
+ while (current_char != '\n')
+ {
+ skip_input ();
+ if (current_char == '\r')
+ {
+ char next_char = peek_input ();
+ if (next_char == '\n')
+ {
+ current_char = '\n';
+ break;
+ }
+ rust_error_at (
+ loc, "Isolated CR %<\\r%> not allowed in doc comment");
+ current_char = next_char;
+ continue;
+ }
+ if (current_char == EOF)
+ {
+ rust_error_at (
+ loc, "unexpected EOF while looking for end of comment");
+ break;
+ }
+ str += current_char;
+ current_char = peek_input ();
+ }
+ skip_input ();
+ current_line++;
+ current_column = 1;
+ // tell line_table that new line starts
+ start_line (current_line, max_column_hint);
+
+ str.shrink_to_fit ();
+ if (is_inner)
+ return Token::make_inner_doc_comment (loc, std::move (str));
+ else
+ return Token::make_outer_doc_comment (loc, std::move (str));
+ }
+ else if (peek_input () == '*' && peek_input (1) == '*'
+ && peek_input (2) == '/')
+ {
+ /**/
+ skip_input (2);
+ current_column += 4;
+ continue;
+ }
+ else if (peek_input () == '*' && peek_input (1) == '*'
+ && peek_input (2) == '*' && peek_input (3) == '/')
+ {
+ /***/
+ skip_input (3);
+ current_column += 5;
+ continue;
+ }
+ else if ((peek_input () == '*' && peek_input (1) != '!'
+ && peek_input (1) != '*')
+ || (peek_input () == '*' && peek_input (1) == '*'
+ && peek_input (2) == '*'))
+ {
+ // one /* or three /***
+ // block comment
+ // (but not an inner or outer doc comment)
+ skip_input ();
+ current_column += 2;
+
+ int level = 1;
+ while (level > 0)
+ {
+ current_char = peek_input ();
+
+ if (current_char == EOF)
+ {
+ rust_error_at (
+ loc, "unexpected EOF while looking for end of comment");
+ break;
+ }
+
+ // if /* found
+ if (current_char == '/' && peek_input (1) == '*')
+ {
+ // skip /* characters
+ skip_input (1);
+
+ current_column += 2;
+
+ level += 1;
+ continue;
+ }
+
+ // ignore until */ is found
+ if (current_char == '*' && peek_input (1) == '/')
+ {
+ // skip */ characters
+ skip_input (1);
+
+ current_column += 2;
+
+ level -= 1;
+ continue;
+ }
+
+ if (current_char == '\n')
+ {
+ skip_input ();
+ current_line++;
+ current_column = 1;
+ // tell line_table that new line starts
+ start_line (current_line, max_column_hint);
+ continue;
+ }
+
+ skip_input ();
+ current_column++;
+ }
+
+ // refresh new token
+ continue;
+ }
+ else if (peek_input () == '*'
+ && (peek_input (1) == '!' || peek_input (1) == '*'))
+ {
+ // block doc comment, inner /*! or outer /**
+ bool is_inner = peek_input (1) == '!';
+ skip_input (1);
+ current_column += 3;
+
+ std::string str;
+ str.reserve (96);
+
+ int level = 1;
+ while (level > 0)
+ {
+ current_char = peek_input ();
+
+ if (current_char == EOF)
+ {
+ rust_error_at (
+ loc, "unexpected EOF while looking for end of comment");
+ break;
+ }
+
+ // if /* found
+ if (current_char == '/' && peek_input (1) == '*')
+ {
+ // skip /* characters
+ skip_input (1);
+ current_column += 2;
+
+ level += 1;
+ str += "/*";
+ continue;
+ }
+
+ // ignore until */ is found
+ if (current_char == '*' && peek_input (1) == '/')
+ {
+ // skip */ characters
+ skip_input (1);
+ current_column += 2;
+
+ level -= 1;
+ if (level > 0)
+ str += "*/";
+ continue;
+ }
+
+ if (current_char == '\r' && peek_input (1) != '\n')
+ rust_error_at (
+ loc, "Isolated CR %<\\r%> not allowed in doc comment");
+
+ if (current_char == '\n')
+ {
+ skip_input ();
+ current_line++;
+ current_column = 1;
+ // tell line_table that new line starts
+ start_line (current_line, max_column_hint);
+ str += '\n';
+ continue;
+ }
+
+ str += current_char;
+ skip_input ();
+ current_column++;
+ }
+
+ str.shrink_to_fit ();
+ if (is_inner)
+ return Token::make_inner_doc_comment (loc, std::move (str));
+ else
+ return Token::make_outer_doc_comment (loc, std::move (str));
+ }
+ else
+ {
+ // division
+ current_column++;
+ return Token::make (DIV, loc);
+ }
+ case '%':
+ if (peek_input () == '=')
+ {
+ // modulo-assign
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (PERCENT_EQ, loc);
+ }
+ else
+ {
+ // modulo
+ current_column++;
+ return Token::make (PERCENT, loc);
+ }
+ case '^':
+ if (peek_input () == '=')
+ {
+ // xor-assign?
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (CARET_EQ, loc);
+ }
+ else
+ {
+ // xor?
+ current_column++;
+ return Token::make (CARET, loc);
+ }
+ case '<':
+ if (peek_input () == '<')
+ {
+ if (peek_input (1) == '=')
+ {
+ // left-shift assign
+ skip_input (1);
+ current_column += 3;
+
+ return Token::make (LEFT_SHIFT_EQ, loc);
+ }
+ else
+ {
+ // left-shift
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (LEFT_SHIFT, loc);
+ }
+ }
+ else if (peek_input () == '=')
+ {
+ // smaller than or equal to
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (LESS_OR_EQUAL, loc);
+ }
+ else
+ {
+ // smaller than
+ current_column++;
+ return Token::make (LEFT_ANGLE, loc);
+ }
+ break;
+ case '>':
+ if (peek_input () == '>')
+ {
+ if (peek_input (1) == '=')
+ {
+ // right-shift-assign
+ skip_input (1);
+ current_column += 3;
+
+ return Token::make (RIGHT_SHIFT_EQ, loc);
+ }
+ else
+ {
+ // right-shift
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (RIGHT_SHIFT, loc);
+ }
+ }
+ else if (peek_input () == '=')
+ {
+ // larger than or equal to
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (GREATER_OR_EQUAL, loc);
+ }
+ else
+ {
+ // larger than
+ current_column++;
+ return Token::make (RIGHT_ANGLE, loc);
+ }
+ case ':':
+ if (peek_input () == ':')
+ {
+ // scope resolution ::
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (SCOPE_RESOLUTION, loc);
+ }
+ else
+ {
+ // single colon :
+ current_column++;
+ return Token::make (COLON, loc);
+ }
+ case '!':
+ // no special handling for macros in lexer?
+ if (peek_input () == '=')
+ {
+ // not equal boolean operator
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (NOT_EQUAL, loc);
+ }
+ else
+ {
+ // not equal unary operator
+ current_column++;
+
+ return Token::make (EXCLAM, loc);
+ }
+ case '?':
+ current_column++;
+ return Token::make (QUESTION_MARK, loc);
+ case '#':
+ current_column++;
+ return Token::make (HASH, loc);
+ case '[':
+ current_column++;
+ return Token::make (LEFT_SQUARE, loc);
+ case ']':
+ current_column++;
+ return Token::make (RIGHT_SQUARE, loc);
+ case '{':
+ current_column++;
+ return Token::make (LEFT_CURLY, loc);
+ case '}':
+ current_column++;
+ return Token::make (RIGHT_CURLY, loc);
+ case '@':
+ current_column++;
+ return Token::make (PATTERN_BIND, loc);
+ case '$':
+ current_column++;
+ return Token::make (DOLLAR_SIGN, loc);
+ case '~':
+ current_column++;
+ return Token::make (TILDE, loc);
+ case '\\':
+ current_column++;
+ return Token::make (BACKSLASH, loc);
+ case '`':
+ current_column++;
+ return Token::make (BACKTICK, loc);
+ case '|':
+ if (peek_input () == '=')
+ {
+ // bitwise or-assign?
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (PIPE_EQ, loc);
+ }
+ else if (peek_input () == '|')
+ {
+ // logical or
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (OR, loc);
+ }
+ else
+ {
+ // bitwise or
+ current_column++;
+
+ return Token::make (PIPE, loc);
+ }
+ case '&':
+ if (peek_input () == '=')
+ {
+ // bitwise and-assign?
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (AMP_EQ, loc);
+ }
+ else if (peek_input () == '&')
+ {
+ // logical and
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (LOGICAL_AND, loc);
+ }
+ else
+ {
+ // bitwise and/reference
+ current_column++;
+
+ return Token::make (AMP, loc);
+ }
+ case '.':
+ if (peek_input () == '.')
+ {
+ if (peek_input (1) == '.')
+ {
+ // ellipsis
+ skip_input (1);
+ current_column += 3;
+
+ return Token::make (ELLIPSIS, loc);
+ }
+ else if (peek_input (1) == '=')
+ {
+ // ..=
+ skip_input (1);
+ current_column += 3;
+
+ return Token::make (DOT_DOT_EQ, loc);
+ }
+ else
+ {
+ // ..
+ skip_input ();
+ current_column += 2;
+
+ return Token::make (DOT_DOT, loc);
+ }
+ }
+ else /*if (!ISDIGIT (peek_input ()))*/
+ {
+ // single dot .
+ // Only if followed by a non-number - otherwise is float
+ // nope, float cannot start with '.'.
+ current_column++;
+ return Token::make (DOT, loc);
+ }
+ }
+ // TODO: special handling of _ in the lexer? instead of being identifier
+
+ // byte character, byte string and raw byte string literals
+ if (current_char == 'b')
+ {
+ if (peek_input () == '\'')
+ return parse_byte_char (loc);
+ else if (peek_input () == '"')
+ return parse_byte_string (loc);
+ else if (peek_input () == 'r'
+ && (peek_input (1) == '#' || peek_input (1) == '"'))
+ return parse_raw_byte_string (loc);
+ }
+
+ // raw identifiers and raw strings
+ if (current_char == 'r')
+ {
+ int peek = peek_input ();
+ int peek1 = peek_input (1);
+
+ if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
+ {
+ TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
+ if (raw_ident_ptr != nullptr)
+ return raw_ident_ptr;
+ else
+ continue; /* input got parsed, it just wasn't valid. An error
+ was produced. */
+ }
+ else
+ {
+ TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc);
+ if (maybe_raw_string_ptr != nullptr)
+ return maybe_raw_string_ptr;
+ }
+ }
+
+ // find identifiers and keywords
+ if (ISALPHA (current_char) || current_char == '_')
+ return parse_identifier_or_keyword (loc);
+
+ // int and float literals
+ if (ISDIGIT (current_char))
+ { // _ not allowed as first char
+ if (current_char == '0'
+ && is_non_decimal_int_literal_separator (peek_input ()))
+ {
+ // handle binary, octal, hex literals
+ TokenPtr non_dec_int_lit_ptr
+ = parse_non_decimal_int_literals (loc);
+ if (non_dec_int_lit_ptr != nullptr)
+ return non_dec_int_lit_ptr;
+ }
+ else
+ {
+ // handle decimals (integer or float)
+ TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc);
+ if (decimal_or_float_ptr != nullptr)
+ return decimal_or_float_ptr;
+ }
+ }
+
+ // string literals
+ if (current_char == '"')
+ return parse_string (loc);
+
+ // char literals and lifetime names
+ if (current_char == '\'')
+ {
+ TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc);
+ if (char_or_lifetime_ptr != nullptr)
+ return char_or_lifetime_ptr;
+ }
+
+ // DEBUG: check for specific character problems:
+ if (current_char == '0')
+ rust_debug ("'0' uncaught before unexpected character");
+ else if (current_char == ']')
+ rust_debug ("']' uncaught before unexpected character");
+ else if (current_char == 0x5d)
+ rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before "
+ "unexpected character");
+
+ // didn't match anything so error
+ rust_error_at (loc, "unexpected character %<%x%>", current_char);
+ current_column++;
+ }
+}
+
+/* Reads an optional numeric type suffix (such as "u8" or "f64") beginning at
+   current_char.  Underscores inside the suffix are consumed but not stored.
+   Returns the matching core type together with the number of characters
+   consumed; the type is CORETYPE_UNKNOWN when no suffix is present, or when
+   the suffix is not recognised (an error is reported in that case).  */
+std::pair<PrimitiveCoreType, int>
+Lexer::parse_in_type_suffix ()
+{
+  // every suffix spelling understood here and the core type it selects
+  static const struct
+  {
+    const char *spelling;
+    PrimitiveCoreType type;
+  } known_suffixes[] = {
+    {"f32", CORETYPE_F32},   {"f64", CORETYPE_F64},
+    {"i8", CORETYPE_I8},     {"i16", CORETYPE_I16},
+    {"i32", CORETYPE_I32},   {"i64", CORETYPE_I64},
+    {"i128", CORETYPE_I128}, {"isize", CORETYPE_ISIZE},
+    {"u8", CORETYPE_U8},     {"u16", CORETYPE_U16},
+    {"u32", CORETYPE_U32},   {"u64", CORETYPE_U64},
+    {"u128", CORETYPE_U128}, {"usize", CORETYPE_USIZE},
+  };
+
+  std::string suffix;
+  suffix.reserve (5);
+
+  int consumed = 0;
+
+  // gather the suffix characters
+  for (; ISALPHA (current_char) || ISDIGIT (current_char)
+         || current_char == '_';
+       current_char = peek_input ())
+    {
+      // an underscore counts towards the length but is not kept
+      if (current_char != '_')
+        suffix += current_char;
+
+      skip_input ();
+      consumed++;
+    }
+
+  // no type suffix: do nothing but also no error
+  if (suffix.empty ())
+    return std::make_pair (CORETYPE_UNKNOWN, consumed);
+
+  for (const auto &candidate : known_suffixes)
+    {
+      if (suffix == candidate.spelling)
+        return std::make_pair (candidate.type, consumed);
+    }
+
+  rust_error_at (get_current_location (), "unknown number suffix %qs",
+                 suffix.c_str ());
+
+  return std::make_pair (CORETYPE_UNKNOWN, consumed);
+}
+
+/* Reads the exponent part of a float literal when current_char is 'e' or 'E'.
+   Returns the text collected (the exponent letter, a '-' sign if present, and
+   the digits) paired with the number of characters consumed.  When there is
+   no exponent the string is empty and the count is zero.  */
+std::pair<std::string, int>
+Lexer::parse_in_exponent_part ()
+{
+  std::string exponent;
+  int consumed = 0;
+
+  if (current_char != 'E' && current_char != 'e')
+    return std::make_pair (exponent, consumed);
+
+  // add exponent to string as strtod works with it
+  exponent += current_char;
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  // a sign is always consumed, but only '-' is kept in the text
+  if (current_char == '-' || current_char == '+')
+    {
+      if (current_char == '-')
+        exponent += '-';
+
+      skip_input ();
+      current_char = peek_input ();
+      consumed++;
+    }
+
+  // the exponent value itself is a plain decimal number
+  auto digits = parse_in_decimal ();
+  exponent += std::get<0> (digits);
+  consumed += std::get<1> (digits);
+
+  return std::make_pair (exponent, consumed);
+}
+
+/* Reads a run of decimal digits and underscores starting at current_char.
+   Returns the digits with underscores removed, the number of characters
+   consumed (underscores included), and whether the run consisted of digits
+   only.  */
+std::tuple<std::string, int, bool>
+Lexer::parse_in_decimal ()
+{
+  /* A pure decimal contains only digits. */
+  bool only_digits = true;
+  int consumed = 0;
+  std::string digits;
+
+  for (; ISDIGIT (current_char) || current_char == '_';
+       current_char = peek_input ())
+    {
+      if (current_char == '_')
+        only_digits = false; // separator: counted, but not part of the number
+      else
+        digits += current_char;
+
+      skip_input ();
+      consumed++;
+    }
+
+  return std::make_tuple (digits, consumed, only_digits);
+}
+
+/* Parses escapes (and string continues) in "byte" strings and characters. Does
+ * not support unicode.
+ *
+ * opening_char is the delimiter of the enclosing literal; it is only used to
+ * word the diagnostic for a rejected \u escape.
+ *
+ * Returns a tuple of: the decoded byte (0 when nothing could be decoded), the
+ * number of characters to add to the literal's length, and a flag that is
+ * true only when the escape was a string continue. */
+std::tuple<char, int, bool>
+Lexer::parse_escape (char opening_char)
+{
+  int additional_length_offset = 0;
+  char output_char = 0;
+
+  // skip to actual letter
+  skip_input ();
+  current_char = peek_input ();
+  additional_length_offset++;
+
+  switch (current_char)
+    {
+      case 'x': {
+        auto hex_escape_pair = parse_partial_hex_escape ();
+        long hexLong = hex_escape_pair.first;
+        additional_length_offset += hex_escape_pair.second;
+
+        if (hexLong > 255 || hexLong < 0)
+          rust_error_at (
+            get_current_location (),
+            "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>",
+            static_cast<unsigned int> (hexLong));
+        /* TODO: restore capital for escape output - gcc pretty-printer doesn't
+         * support %X directly */
+        char hexChar = static_cast<char> (hexLong);
+
+        output_char = hexChar;
+      }
+      break;
+    case 'n':
+      output_char = '\n';
+      break;
+    case 'r':
+      output_char = '\r';
+      break;
+    case 't':
+      output_char = '\t';
+      break;
+    case '\\':
+      output_char = '\\';
+      break;
+    case '0':
+      output_char = '\0';
+      break;
+    case '\'':
+      output_char = '\'';
+      break;
+    case '"':
+      output_char = '"';
+      break;
+      case 'u': {
+        rust_error_at (get_current_location (),
+                       "cannot have a unicode escape \\u in a byte %s",
+                       opening_char == '\'' ? "character" : "string");
+        /* Try to parse it anyway, just to skip it. The decoded value is
+         * dropped, but the characters consumed still have to be added to the
+         * length so that column tracking stays in step with the input. */
+        auto skipped_escape = parse_partial_unicode_escape ();
+        additional_length_offset += skipped_escape.second;
+        return std::make_tuple (output_char, additional_length_offset, false);
+      }
+    case '\r':
+    case '\n':
+      // string continue
+      return std::make_tuple (0, parse_partial_string_continue (), true);
+    default:
+      rust_error_at (get_current_location (),
+                     "unknown escape sequence %<\\%c%>", current_char);
+      // nothing could be decoded; the offending character is left in place
+      return std::make_tuple (output_char, additional_length_offset, false);
+    }
+  // all non-special cases (string continue) should skip their used char
+  skip_input ();
+  current_char = peek_input ();
+  additional_length_offset++;
+
+  return std::make_tuple (output_char, additional_length_offset, false);
+}
+
+/* Decodes one escape sequence (or string continue) inside a string or
+ * character literal. Unlike parse_escape, \u{...} escapes are accepted here.
+ *
+ * Returns a tuple of: the decoded code point, the number of characters to add
+ * to the literal's length, and a flag that is true only for a string
+ * continue. */
+std::tuple<Codepoint, int, bool>
+Lexer::parse_utf8_escape (char opening_char)
+{
+  Codepoint decoded;
+  int consumed = 0;
+
+  // move past the current character so current_char names the escape kind
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  switch (current_char)
+    {
+    case '\r':
+    case '\n':
+      // string continue
+      return std::make_tuple (0, parse_partial_string_continue (), true);
+      case 'u': {
+        // the helper consumes the whole escape body, so return straight away
+        auto unicode_escape = parse_partial_unicode_escape ();
+        decoded = unicode_escape.first;
+        consumed += unicode_escape.second;
+
+        return std::make_tuple (decoded, consumed, false);
+      }
+      case 'x': {
+        auto hex_escape = parse_partial_hex_escape ();
+        const long value = hex_escape.first;
+        consumed += hex_escape.second;
+
+        if (value > 127 || value < 0)
+          rust_error_at (
+            get_current_location (),
+            "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>",
+            static_cast<unsigned int> (value));
+        /* TODO: restore capital for escape output - gcc pretty-printer doesn't
+         * support %X directly */
+        decoded = static_cast<char> (value);
+      }
+      break;
+    case 'n':
+      decoded = '\n';
+      break;
+    case 'r':
+      decoded = '\r';
+      break;
+    case 't':
+      decoded = '\t';
+      break;
+    case '\\':
+      decoded = '\\';
+      break;
+    case '0':
+      decoded = '\0';
+      break;
+    case '\'':
+      decoded = '\'';
+      break;
+    case '"':
+      decoded = '"';
+      break;
+    default:
+      rust_error_at (get_current_location (),
+                     "unknown escape sequence %<\\%c%>", current_char);
+      // nothing was decoded; the offending character is left in place
+      return std::make_tuple (decoded, consumed, false);
+    }
+
+  /* every case that reaches this point still has its last character pending,
+   * so step over it */
+  skip_input ();
+  current_char = peek_input ();
+  consumed++;
+
+  return std::make_tuple (decoded, consumed, false);
+}
+
+/* Consumes the whitespace that makes up a string continue found in an escape.
+   Each newline advances the line bookkeeping and restarts the count, so the
+   value returned is relative to the last line reached.  */
+int
+Lexer::parse_partial_string_continue ()
+{
+  int consumed = 1;
+
+  for (; is_whitespace (current_char); current_char = peek_input ())
+    {
+      if (current_char == '\n')
+        {
+          current_line++;
+          current_column = 1;
+          // tell line_table that new line starts
+          start_line (current_line, max_column_hint);
+
+          // a fresh line restarts the "length"
+          consumed = 1;
+        }
+      else
+        {
+          consumed++;
+        }
+
+      skip_input ();
+    }
+
+  return consumed;
+}
+
+/* Parses the body of a '\x' escape. Note that it does not check that the number
+ * is valid and smaller than 255.
+ *
+ * The two characters after the one currently at the front of the input are
+ * read as hex digits (the callers in this file invoke this while current_char
+ * is the 'x').
+ *
+ * Returns the parsed value paired with a length offset: (0, 0) when the first
+ * character is not a hex digit, (0, 1) when the second is not, otherwise the
+ * value of the two digits and 2. An error is reported in the failing cases. */
+std::pair<long, int>
+Lexer::parse_partial_hex_escape ()
+{
+ // hex char string (null-terminated) handed to strtol below
+ char hexNum[3] = {0, 0, 0};
+
+ // first hex char: looked at one position ahead, nothing is consumed yet
+ current_char = peek_input (1);
+ int additional_length_offset = 1;
+
+ if (!is_x_digit (current_char))
+ {
+ rust_error_at (get_current_location (),
+ "invalid character %<\\x%c%> in \\x sequence",
+ current_char);
+ return std::make_pair (0, 0);
+ }
+ hexNum[0] = current_char;
+
+ // second hex char: advance once, then look one position ahead again
+ skip_input ();
+ current_char = peek_input (1);
+ additional_length_offset++;
+
+ if (!is_x_digit (current_char))
+ {
+ rust_error_at (get_current_location (),
+ "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0],
+ current_char);
+ return std::make_pair (0, 1);
+ }
+ skip_input (); // NOTE(review): presumably leaves the second digit for the caller to skip - confirm
+ hexNum[1] = current_char;
+
+ long hexLong = std::strtol (hexNum, nullptr, 16); // both digits were validated above
+
+ return std::make_pair (hexLong, additional_length_offset);
+}
+
+/* Parses the body of a unicode escape.
+ *
+ * Expects '{', between 1 and 6 hex digits (underscores are allowed between
+ * them and ignored, but not directly after the '{'), then '}'.
+ *
+ * Returns the decoded code point paired with the number of characters counted
+ * while consuming the braces, digits and underscores; the initial skip below
+ * is not counted. Every error path reports a diagnostic and yields
+ * Codepoint (0). */
+std::pair<Codepoint, int>
+Lexer::parse_partial_unicode_escape ()
+{
+ skip_input (); // presumably steps over the 'u' the callers stop on - confirm
+ current_char = peek_input ();
+ int additional_length_offset = 0;
+
+ if (current_char != '{')
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape should start with %<{%>");
+ /* Skip what should probably have been between brackets. */
+ while (is_x_digit (current_char) || current_char == '_')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+
+ skip_input (); // consume the '{'
+ current_char = peek_input ();
+ additional_length_offset++;
+
+ if (current_char == '_')
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape cannot start with %<_%>");
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ // fallthrough and try to parse the rest anyway
+ }
+
+ // parse unicode escape - 1-6 hex digits
+ std::string num_str;
+ num_str.reserve (6);
+
+ // loop through to add entire hex number to string
+ while (is_x_digit (current_char) || current_char == '_')
+ {
+ if (current_char == '_')
+ {
+ // don't add _ to number
+ skip_input ();
+ current_char = peek_input ();
+
+ additional_length_offset++;
+
+ continue;
+ }
+
+ additional_length_offset++;
+
+ // add raw hex numbers
+ num_str += current_char;
+
+ skip_input ();
+ current_char = peek_input ();
+ }
+
+ if (current_char == '}')
+ {
+ skip_input (); // consume the closing '}'
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ else
+ {
+ // actually an error, but allow propagation anyway. Assume that a
+ // wrong bracket, whitespace or single/double quotes are wrong
+ // termination, otherwise it is a wrong character, then skip to the actual
+ // terminator.
+ if (current_char == '{' || is_whitespace (current_char)
+ || current_char == '\'' || current_char == '"')
+ {
+ rust_error_at (get_current_location (),
+ "expected terminating %<}%> in unicode escape");
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+ else
+ {
+ rust_error_at (get_current_location (),
+ "invalid character %<%c%> in unicode escape",
+ current_char);
+ while (current_char != '}' && current_char != '{'
+ && !is_whitespace (current_char) && current_char != '\''
+ && current_char != '"')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ // Consume the actual closing bracket if found
+ if (current_char == '}')
+ {
+ skip_input ();
+ current_char = peek_input ();
+ additional_length_offset++;
+ }
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+ }
+
+ // ensure 1-6 hex characters
+ if (num_str.length () > 6 || num_str.length () < 1)
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape should be between 1 and 6 hex "
+ "characters; it is %lu",
+ (unsigned long) num_str.length ());
+ // return false;
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+
+ unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16);
+
+ if (hex_num > 0xd7ff && hex_num < 0xe000) // surrogate range is rejected
+ {
+ rust_error_at (
+ get_current_location (),
+ "unicode escape cannot be a surrogate value (D800 to DFFF)");
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+
+ if (hex_num > 0x10ffff) // largest value accepted by this check
+ {
+ rust_error_at (get_current_location (),
+ "unicode escape cannot be larger than 10FFFF");
+ return std::make_pair (Codepoint (0), additional_length_offset);
+ }
+
+ // return true;
+ return std::make_pair (Codepoint (static_cast<uint32_t> (hex_num)),
+ additional_length_offset);
+}
+
+/* Parses a byte character literal and returns a byte-char token located at
+   loc.  An empty literal or a missing closing quote is reported as an error,
+   but a token is produced regardless.  */
+TokenPtr
+Lexer::parse_byte_char (Location loc)
+{
+  skip_input ();
+  current_column++;
+  // make current char the next character
+  current_char = peek_input ();
+
+  int length = 1;
+
+  // the byte that ends up in the token
+  char byte_char = 0;
+
+  // an immediately closed literal has nothing to decode
+  if (current_char == '\'')
+    {
+      rust_error_at (get_current_location (),
+                     "no character inside %<%> for %<byte char%>");
+
+      current_column += length;
+      return Token::make_byte_char (loc, byte_char);
+    }
+
+  if (current_char == '\\')
+    {
+      // escaped byte
+      auto escape = parse_escape ('\'');
+      byte_char = std::get<0> (escape);
+      length += std::get<1> (escape);
+
+      current_char = peek_input ();
+    }
+  else
+    {
+      // otherwise, take the byte directly from the input
+      byte_char = current_char;
+
+      skip_input ();
+      current_char = peek_input ();
+      length++;
+    }
+
+  // either way a closing quote is expected next
+  if (current_char != '\'')
+    rust_error_at (get_current_location (), "unclosed %<byte char%>");
+
+  skip_input ();
+  current_char = peek_input ();
+  length++; // go to next char
+
+  current_column += length;
+
+  return Token::make_byte_char (loc, byte_char);
+}
+
+/* Parses a byte string literal and returns a byte-string token located at
+   loc.  If the input ends before the closing quote, an error is reported and
+   an END_OF_FILE token is returned instead.  */
+TokenPtr
+Lexer::parse_byte_string (Location loc)
+{
+  // skip quote character
+  skip_input ();
+  current_column++;
+
+  std::string contents;
+  contents.reserve (16); // some sensible default
+
+  int length = 1;
+  current_char = peek_input ();
+
+  while (current_char != '"' && current_char != EOF)
+    {
+      if (current_char != '\\')
+        {
+          // ordinary byte: copy it through
+          contents += current_char;
+          length++;
+
+          skip_input ();
+          current_char = peek_input ();
+          continue;
+        }
+
+      // parse_escape leaves current_char on the first unconsumed character
+      auto escape = parse_escape ('"');
+      const char decoded = std::get<0> (escape);
+      const bool is_string_continue = std::get<2> (escape);
+
+      if (decoded == 0 && is_string_continue)
+        {
+          // a string continue adds no byte and restarts the length count
+          length = std::get<1> (escape) - 1;
+        }
+      else
+        {
+          length += std::get<1> (escape);
+          contents += decoded;
+        }
+    }
+
+  current_column += length;
+
+  if (current_char == EOF)
+    {
+      rust_error_at (get_current_location (), "unended byte string literal");
+      return Token::make (END_OF_FILE, get_current_location ());
+    }
+
+  // the loop above only stops on EOF or the closing quote
+  if (current_char != '"')
+    gcc_unreachable ();
+
+  current_column++;
+
+  skip_input ();
+  current_char = peek_input ();
+
+  contents.shrink_to_fit ();
+
+  return Token::make_byte_string (loc, std::move (contents));
+}
+
+/* Parses a raw byte string and returns a byte-string token located at loc.
+   The literal ends at the first '"' followed by as many '#' characters as
+   preceded the opening quote.  Characters above 127 are reported and replaced
+   with 0.  If the input ends before the terminator, an error is reported and
+   an END_OF_FILE token is returned, as parse_byte_string does.  */
+TokenPtr
+Lexer::parse_raw_byte_string (Location loc)
+{
+  // raw byte string literals
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1;
+  int hash_count = 0;
+
+  // get hash count at beginning
+  skip_input ();
+  current_char = peek_input ();
+  length++;
+  while (current_char == '#')
+    {
+      hash_count++;
+      length++;
+
+      skip_input ();
+      current_char = peek_input ();
+    }
+
+  if (current_char != '"')
+    {
+      rust_error_at (get_current_location (),
+                     "raw byte string has no opening %<\"%>");
+    }
+
+  skip_input ();
+  current_char = peek_input ();
+  length++;
+
+  while (true)
+    {
+      /* Without this check an unterminated literal would never leave the
+         loop: the input keeps yielding EOF, which also trips the
+         out-of-range test below on every iteration.  */
+      if (current_char == EOF)
+        {
+          current_column += length;
+
+          rust_error_at (get_current_location (),
+                         "unended raw byte string literal");
+          return Token::make (END_OF_FILE, get_current_location ());
+        }
+
+      if (current_char == '"')
+        {
+          // a quote only terminates when followed by hash_count hashes
+          bool enough_hashes = true;
+
+          for (int i = 0; i < hash_count; i++)
+            {
+              if (peek_input (i + 1) != '#')
+                {
+                  enough_hashes = false;
+                  break;
+                }
+            }
+
+          if (enough_hashes)
+            {
+              // skip enough input and peek enough input
+              skip_input (hash_count);
+              current_char = peek_input ();
+              length += hash_count + 1;
+              break;
+            }
+        }
+
+      if (static_cast<unsigned char> (current_char) > 127)
+        {
+          rust_error_at (get_current_location (),
+                         "character %<%c%> in raw byte string out of range",
+                         current_char);
+          current_char = 0;
+        }
+
+      length++;
+
+      str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
+
+  current_column += length;
+
+  str.shrink_to_fit ();
+
+  return Token::make_byte_string (loc, std::move (str));
+}
+
+/* Parses a raw identifier and returns an identifier token located at loc.
+   Returns nullptr, after reporting an error, when the name is one of the
+   spellings that may not be used as a raw identifier.  A lone underscore is
+   reported as an error too, but a token is still produced for it.  */
+TokenPtr
+Lexer::parse_raw_identifier (Location loc)
+{
+  // names rejected as raw identifiers
+  static const char *const forbidden_names[]
+    = {"crate", "extern", "self", "super", "Self"};
+
+  std::string name;
+  name.reserve (16); // default
+
+  skip_input ();
+  current_char = peek_input ();
+
+  current_column += 2;
+
+  const bool starts_with_underscore = current_char == '_';
+
+  int length = 0;
+  // collect the entire name
+  for (current_char = peek_input ();
+       ISALPHA (current_char) || ISDIGIT (current_char)
+       || current_char == '_';
+       current_char = peek_input ())
+    {
+      length++;
+
+      name += current_char;
+      skip_input ();
+    }
+
+  current_column += length;
+
+  // if just a single underscore, not an identifier
+  if (starts_with_underscore && length == 1)
+    rust_error_at (get_current_location (),
+                   "%<_%> is not a valid raw identifier");
+
+  for (const char *forbidden : forbidden_names)
+    {
+      if (name == forbidden)
+        {
+          rust_error_at (get_current_location (),
+                         "%qs is a forbidden raw identifier", name.c_str ());
+
+          return nullptr;
+        }
+    }
+
+  name.shrink_to_fit ();
+
+  return Token::make_identifier (loc, std::move (name));
+}
+
+/* Skips broken string input (unterminated strings): consumes characters up to
+   and including the next '"', or up to EOF, keeping current_line and
+   current_column in step.  Note that the parameter has the same name as the
+   current_char used by the other lexer methods, so the assignments below only
+   update the local copy.  */
+void
+Lexer::skip_broken_string_input (int current_char)
+{
+  for (; current_char != '"' && current_char != EOF;
+       current_char = peek_input ())
+    {
+      if (current_char == '\n')
+        {
+          current_line++;
+          current_column = 1;
+        }
+      else
+        {
+          current_column++;
+        }
+
+      skip_input ();
+    }
+
+  // swallow the quote that ended the scan, if there was one
+  if (current_char == '"')
+    {
+      current_column++;
+
+      skip_input ();
+      current_char = peek_input ();
+    }
+
+  rust_debug ("skipped to %d:%d due to bad quotes", current_line,
+              current_column);
+}
+
+/* Parses a unicode string literal and returns a string token located at loc.
+   Escapes are decoded through parse_utf8_escape.  If the input ends before
+   the closing quote, an error is reported and an END_OF_FILE token is
+   returned instead.  */
+TokenPtr
+Lexer::parse_string (Location loc)
+{
+  Codepoint current_char32;
+
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1;
+  current_char32 = peek_codepoint_input ();
+
+  // stop at the closing quote, or at EOF for an unterminated literal
+  while (current_char32.value != '"' && !current_char32.is_eof ())
+    {
+      if (current_char32.value == '\\')
+        {
+          /* parse escape; the enclosing literal is delimited by double
+             quotes, matching what parse_byte_string passes to parse_escape */
+          auto utf8_escape_pair = parse_utf8_escape ('"');
+          current_char32 = std::get<0> (utf8_escape_pair);
+
+          // a string continue adds no character and restarts the length
+          if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair))
+            length = std::get<1> (utf8_escape_pair) - 1;
+          else
+            length += std::get<1> (utf8_escape_pair);
+
+          if (current_char32 != Codepoint (0)
+              || !std::get<2> (utf8_escape_pair))
+            str += current_char32;
+
+          // required as parsing utf8 escape only changes current_char
+          current_char32 = peek_codepoint_input ();
+
+          continue;
+        }
+
+      length += get_input_codepoint_length ();
+
+      str += current_char32;
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
+    }
+
+  current_column += length;
+
+  if (current_char32.value == '"')
+    {
+      current_column++;
+
+      skip_input ();
+      current_char = peek_input ();
+    }
+  else if (current_char32.is_eof ())
+    {
+      rust_error_at (get_current_location (), "unended string literal");
+      return Token::make (END_OF_FILE, get_current_location ());
+    }
+  else
+    {
+      // the loop above only stops on the closing quote or EOF
+      gcc_unreachable ();
+    }
+
+  str.shrink_to_fit ();
+  return Token::make_string (loc, std::move (str));
+}
+
+/* Parses an identifier or keyword whose first character is current_char.
+   Returns an UNDERSCORE token for a lone '_', the keyword's own token when
+   classify_keyword recognises the spelling, and an identifier token
+   otherwise.  */
+TokenPtr
+Lexer::parse_identifier_or_keyword (Location loc)
+{
+  std::string name;
+  name.reserve (16); // default
+  name += current_char;
+
+  const bool starts_with_underscore = current_char == '_';
+
+  int length = 1;
+  // collect the rest of the name
+  for (current_char = peek_input ();
+       ISALPHA (current_char) || ISDIGIT (current_char)
+       || current_char == '_';
+       current_char = peek_input ())
+    {
+      length++;
+
+      name += current_char;
+      skip_input ();
+    }
+
+  current_column += length;
+
+  // if just a single underscore, not an identifier
+  if (starts_with_underscore && length == 1)
+    return Token::make (UNDERSCORE, loc);
+
+  name.shrink_to_fit ();
+
+  const TokenId kind = classify_keyword (name);
+  if (kind != IDENTIFIER)
+    return Token::make (kind, loc);
+
+  return Token::make_identifier (loc, std::move (name));
+}
+
+/* Looks ahead past any run of '#' characters; if a '"' follows, the input is
+   a raw string and its token is returned.  Otherwise returns null.  */
+TokenPtr
+Lexer::maybe_parse_raw_string (Location loc)
+{
+  int hash_count = 0;
+  for (; peek_input (hash_count) == '#'; hash_count++)
+    {
+      // only counting the hashes
+    }
+
+  if (peek_input (hash_count) != '"')
+    return nullptr;
+
+  return parse_raw_string (loc, hash_count);
+}
+
+/* Returns a raw string token located at loc.  initial_hash_count is the
+   number of '#' characters before the opening quote; the literal ends at the
+   first '"' followed by that many '#' characters.  If the input ends before
+   the terminator, an error is reported and an END_OF_FILE token is returned,
+   as parse_string does for ordinary strings.  */
+TokenPtr
+Lexer::parse_raw_string (Location loc, int initial_hash_count)
+{
+  // raw string literals
+  std::string str;
+  str.reserve (16); // some sensible default
+
+  int length = 1 + initial_hash_count;
+
+  if (initial_hash_count > 0)
+    skip_input (initial_hash_count - 1);
+
+  current_char = peek_input ();
+
+  if (current_char != '"')
+    rust_error_at (get_current_location (), "raw string has no opening %<\"%>");
+
+  length++;
+  skip_input ();
+  Codepoint current_char32 = peek_codepoint_input ();
+
+  while (true)
+    {
+      /* Running out of input means the literal was never closed; report it
+         rather than silently producing a token from the partial text.  */
+      if (current_char32.is_eof ())
+        {
+          current_column += length;
+
+          rust_error_at (get_current_location (),
+                         "unended raw string literal");
+          return Token::make (END_OF_FILE, get_current_location ());
+        }
+
+      if (current_char32.value == '"')
+        {
+          // a quote only terminates when followed by enough hashes
+          bool enough_hashes = true;
+
+          for (int i = 0; i < initial_hash_count; i++)
+            {
+              if (peek_input (i + 1) != '#')
+                {
+                  enough_hashes = false;
+                  break;
+                }
+            }
+
+          if (enough_hashes)
+            {
+              // skip enough input and peek enough input
+              skip_input (initial_hash_count);
+              current_char = peek_input ();
+              length += initial_hash_count + 1;
+              break;
+            }
+        }
+
+      length++;
+
+      str += current_char32;
+      skip_codepoint_input ();
+      current_char32 = peek_codepoint_input ();
+    }
+
+  current_column += length;
+
+  str.shrink_to_fit ();
+
+  return Token::make_string (loc, std::move (str));
+}
+
+/* Parses the digits of a hex, octal or binary integer literal, followed by an
+   optional type suffix.
+
+   is_digit_func decides which characters are digits of the literal,
+   existent_str holds the text collected so far and is extended with the
+   digits, and base is the radix used to convert that text.  Underscores are
+   consumed but not stored.
+
+   Returns an integer token located at loc whose text is the value in decimal,
+   or nullptr (after reporting an error) when a float suffix is attached.  */
+template <typename IsDigitFunc>
+TokenPtr
+Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func,
+                                      std::string existent_str, int base)
+{
+  int length = 1;
+
+  skip_input ();
+  current_char = peek_input ();
+
+  length++;
+
+  // loop through to add entire number to string
+  while (is_digit_func (current_char) || current_char == '_')
+    {
+      if (current_char == '_')
+        {
+          // don't add _ to number
+          skip_input ();
+          current_char = peek_input ();
+
+          length++;
+
+          continue;
+        }
+
+      length++;
+
+      // add raw numbers
+      existent_str += current_char;
+      skip_input ();
+      current_char = peek_input ();
+    }
+
+  /* Convert value to decimal representation.  The text never carries a sign,
+     so it is converted as unsigned long long: a signed long saturates at
+     LONG_MAX, which corrupts literals such as 0xFFFFFFFFFFFFFFFF (and
+     0xFFFFFFFF where long is 32 bits wide).  Values that do not fit in
+     unsigned long long still saturate.  */
+  unsigned long long dec_num
+    = std::strtoull (existent_str.c_str (), nullptr, base);
+
+  existent_str = std::to_string (dec_num);
+
+  // parse in type suffix if it exists
+  auto type_suffix_pair = parse_in_type_suffix ();
+  PrimitiveCoreType type_hint = type_suffix_pair.first;
+  length += type_suffix_pair.second;
+
+  current_column += length;
+
+  if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64)
+    {
+      rust_error_at (get_current_location (),
+                     "invalid type suffix %qs for integer (%s) literal",
+                     get_type_hint_string (type_hint),
+                     base == 16
+                       ? "hex"
+                       : (base == 8 ? "octal"
+                                    : (base == 2 ? "binary"
+                                                 : "<insert unknown base>")));
+      return nullptr;
+    }
+  return Token::make_int (loc, std::move (existent_str), type_hint);
+}
+
+/* Parses a hex, binary or octal int literal.  The character after the one in
+   current_char selects the radix; returns nullptr when it is none of 'x',
+   'o' or 'b'.  */
+TokenPtr
+Lexer::parse_non_decimal_int_literals (Location loc)
+{
+  std::string prefix;
+  prefix.reserve (16); // some sensible default
+  prefix += current_char;
+
+  current_char = peek_input ();
+
+  switch (current_char)
+    {
+    case 'x':
+      // hex (integer only); the radix letter is kept in the text
+      return parse_non_decimal_int_literal (loc, is_x_digit, prefix + "x", 16);
+    case 'o':
+      // octal (integer only)
+      return parse_non_decimal_int_literal (loc, is_octal_digit,
+                                            std::move (prefix), 8);
+    case 'b':
+      // binary (integer only)
+      return parse_non_decimal_int_literal (loc, is_bin_digit,
+                                            std::move (prefix), 2);
+    default:
+      return nullptr;
+    }
+}
+
+// Lexes a literal that starts with a decimal digit and returns either an
+// integer token or a float token.  A float is recognised by a '.' followed by
+// a float digit, by a '.' that validly ends the literal, or by an exponent
+// marker ('e' / 'E'); anything else is an integer.
+TokenPtr
+Lexer::parse_decimal_int_or_float (Location loc)
+{
+  std::string text;
+  text.reserve (16); // some sensible default
+  text += current_char;
+
+  int consumed = 1;
+  const bool leading_zero = current_char == '0';
+
+  current_char = peek_input ();
+
+  // digits of the integer (or of the integral part of a float)
+  auto integral_part = parse_in_decimal ();
+  text += std::get<0> (integral_part);
+  consumed += std::get<1> (integral_part);
+
+  // Reads the optional type suffix of a float literal and accounts for its
+  // length.  A suffix that is neither f32 nor f64 is reported and dropped.
+  auto read_float_suffix = [&] () -> PrimitiveCoreType {
+    auto suffix = parse_in_type_suffix ();
+    PrimitiveCoreType hint = suffix.first;
+    consumed += suffix.second;
+
+    const bool acceptable = hint == CORETYPE_F32 || hint == CORETYPE_F64
+			    || hint == CORETYPE_UNKNOWN;
+    if (!acceptable)
+      {
+	rust_error_at (get_current_location (),
+		       "invalid type suffix %qs for floating-point literal",
+		       get_type_hint_string (hint));
+	// ignore invalid type suffix as everything else seems fine
+	hint = CORETYPE_UNKNOWN;
+      }
+    return hint;
+  };
+
+  // Advances the column past the literal and builds the float token.
+  auto finish_float = [&] (PrimitiveCoreType hint) -> TokenPtr {
+    current_column += consumed;
+    text.shrink_to_fit ();
+    return Token::make_float (loc, std::move (text), hint);
+  };
+
+  if (current_char == '.')
+    {
+      if (is_float_digit (peek_input (1)))
+	{
+	  // float with a '.': take the dot, then the fractional digits
+	  text += current_char;
+	  skip_input ();
+	  current_char = peek_input ();
+	  consumed++;
+
+	  auto fractional_part = parse_in_decimal ();
+	  text += std::get<0> (fractional_part);
+	  consumed += std::get<1> (fractional_part);
+
+	  // exponent part, if there is one
+	  auto exponent = parse_in_exponent_part ();
+	  text += exponent.first;
+	  consumed += exponent.second;
+
+	  return finish_float (read_float_suffix ());
+	}
+
+      if (check_valid_float_dot_end (peek_input (1)))
+	{
+	  // float that is just an integer with a terminating '.'
+	  text += current_char;
+	  skip_input ();
+	  current_char = peek_input ();
+	  consumed++;
+
+	  // add a '0' after the '.' to prevent ambiguity
+	  text += '0';
+
+	  // type hint not allowed here
+	  return finish_float (CORETYPE_UNKNOWN);
+	}
+    }
+
+  if (current_char == 'E' || current_char == 'e')
+    {
+      // exponent float with no '.' character
+      auto exponent = parse_in_exponent_part ();
+      text += exponent.first;
+      consumed += exponent.second;
+
+      return finish_float (read_float_suffix ());
+    }
+
+  // Otherwise the literal is an integer; read its optional type suffix.
+  auto suffix = parse_in_type_suffix ();
+  PrimitiveCoreType hint = suffix.first;
+
+  /* A "real" pure decimal doesn't have a suffix and no zero prefix (a lone
+   * "0" still counts).  */
+  if (hint == CORETYPE_UNKNOWN && std::get<2> (integral_part)
+      && (!leading_zero || text.size () == 1))
+    hint = CORETYPE_PURE_DECIMAL;
+
+  consumed += suffix.second;
+  current_column += consumed;
+
+  text.shrink_to_fit ();
+  return Token::make_int (loc, std::move (text), hint);
+}
+
+// Lexes what follows a single quote: an escaped char literal, a plain char
+// literal, or a lifetime name.  Returns nullptr at end of input, or (after
+// reporting an error) when the input is none of these.
+TokenPtr
+Lexer::parse_char_or_lifetime (Location loc)
+{
+  // Columns covered so far; starts at 1, presumably for the opening quote.
+  int width = 1;
+
+  Codepoint first = peek_codepoint_input ();
+  if (first.is_eof ())
+    return nullptr;
+
+  if (first.value == '\\')
+    {
+      // escaped char literal: decode the escape sequence
+      auto escape = parse_utf8_escape ('\'');
+      first = std::get<0> (escape);
+      width += std::get<1> (escape);
+
+      if (peek_codepoint_input ().value == '\'')
+	{
+	  // consume the closing quote
+	  skip_codepoint_input ();
+	  current_char = peek_input ();
+	  width++;
+	}
+      else
+	{
+	  // a token is still produced after the error is reported
+	  rust_error_at (get_current_location (), "unended character literal");
+	}
+
+      current_column += width;
+      return Token::make_char (loc, first);
+    }
+
+  skip_codepoint_input ();
+
+  if (peek_codepoint_input ().value == '\'')
+    {
+      // non-escaped char literal: skip the closing ' character
+      skip_input ();
+      current_char = peek_input ();
+
+      // TODO fix due to different widths of utf-8 chars?
+      current_column += 3;
+
+      return Token::make_char (loc, first);
+    }
+
+  const bool starts_name
+    = ISDIGIT (first.value) || ISALPHA (first.value) || first.value == '_';
+  if (!starts_name)
+    {
+      rust_error_at (
+	get_current_location (),
+	"expected %' after character constant in character literal");
+      return nullptr;
+    }
+
+  // lifetime name: the first codepoint, then any run of [0-9A-Za-z_]
+  std::string name;
+  name += first;
+  width++;
+
+  for (current_char = peek_input ();
+       ISDIGIT (current_char) || ISALPHA (current_char) || current_char == '_';
+       current_char = peek_input ())
+    {
+      name += current_char;
+      skip_input ();
+      width++;
+    }
+
+  current_column += width;
+
+  name.shrink_to_fit ();
+  return Token::make_lifetime (loc, std::move (name));
+}
+
+// Returns the length in bytes (1-4) of the UTF-8 sequence at the current
+// position.  Returns 0 at end of input, for a stray continuation byte, for a
+// missing or malformed continuation byte, and (after reporting an error) for
+// a lead byte that announces more than 4 bytes.  Only the bit patterns are
+// checked; overlong encodings and surrogates are not rejected.
+int
+Lexer::get_input_codepoint_length ()
+{
+  // Test for EOF on the full int.  Narrowing to a byte first would make a
+  // genuine 0xFF input byte indistinguishable from EOF.
+  int raw = peek_input ();
+  if (raw == EOF)
+    return 0;
+
+  uint8_t lead = static_cast<uint8_t> (raw);
+
+  // The lead byte determines how many bytes the sequence should have.
+  int expected_length;
+  if (lead < 128)
+    {
+      // ascii -- 1 byte
+      return 1;
+    }
+  else if ((lead & 0xC0) == 0x80)
+    {
+      // invalid (continuation; can't be first char)
+      return 0;
+    }
+  else if ((lead & 0xE0) == 0xC0)
+    expected_length = 2;
+  else if ((lead & 0xF0) == 0xE0)
+    expected_length = 3;
+  else if ((lead & 0xF8) == 0xF0)
+    expected_length = 4;
+  else
+    {
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [FIRST] (too long)");
+      return 0;
+    }
+
+  // Every following byte must have the continuation pattern 10xxxxxx.
+  for (int i = 1; i < expected_length; i++)
+    {
+      uint8_t continuation = static_cast<uint8_t> (peek_input (i));
+      if ((continuation & 0xC0) != 0x80)
+	return 0;
+    }
+
+  return expected_length;
+}
+
+// Decodes, without consuming it, the UTF-8 sequence at the current position.
+// Returns Codepoint::eof () at end of input and the marker value 0xFFFE when
+// the bytes cannot be decoded (stray or missing continuation byte, or a lead
+// byte announcing more than 4 bytes, which is also reported as an error).
+// Only the bit patterns are checked; overlong encodings and surrogates are
+// not rejected.
+Codepoint
+Lexer::peek_codepoint_input ()
+{
+  // Test for EOF on the full int.  Narrowing to a byte first would make a
+  // genuine 0xFF input byte look like the end of the input.
+  int raw = peek_input ();
+  if (raw == EOF)
+    return Codepoint::eof ();
+
+  uint8_t lead = static_cast<uint8_t> (raw);
+
+  if (lead < 128)
+    {
+      // ascii -- 1 byte
+      return {lead};
+    }
+
+  if ((lead & 0xC0) == 0x80)
+    {
+      // invalid (continuation; can't be first char)
+      return {0xFFFE};
+    }
+
+  // The lead byte gives the number of continuation bytes and the top bits.
+  int continuation_count;
+  uint32_t output;
+  if ((lead & 0xE0) == 0xC0)
+    {
+      // 2 bytes
+      continuation_count = 1;
+      output = lead & 0x1F;
+    }
+  else if ((lead & 0xF0) == 0xE0)
+    {
+      // 3 bytes
+      continuation_count = 2;
+      output = lead & 0x0F;
+    }
+  else if ((lead & 0xF8) == 0xF0)
+    {
+      // 4 bytes
+      continuation_count = 3;
+      output = lead & 0x07;
+    }
+  else
+    {
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [SECND] (too long)");
+      return {0xFFFE};
+    }
+
+  // Each continuation byte (10xxxxxx) contributes its low six bits.
+  for (int i = 1; i <= continuation_count; i++)
+    {
+      uint8_t continuation = static_cast<uint8_t> (peek_input (i));
+      if ((continuation & 0xC0) != 0x80)
+	return {0xFFFE};
+
+      output = (output << 6) | (continuation & 0x3F);
+    }
+
+  return {output};
+}
+
+// Consumes all bytes of the codepoint at the current position.  The length
+// must be at least 1; get_input_codepoint_length () yields 0 at end of input
+// or for malformed UTF-8, and those cases trip the assertion.
+void
+Lexer::skip_codepoint_input ()
+{
+  const int byte_count = get_input_codepoint_length ();
+  gcc_assert (byte_count >= 1);
+
+  // skip_input (n) moves n + 1 chars ahead, hence the "- 1"
+  skip_input (byte_count - 1);
+}
+
+// Returns the length in bytes (1-4) of the UTF-8 sequence that starts
+// N_START_OFFSET bytes ahead of the current position.  Returns 0 at end of
+// input, for a stray continuation byte, for a missing or malformed
+// continuation byte, and (after reporting an error) for a lead byte that
+// announces more than 4 bytes.
+int
+Lexer::test_get_input_codepoint_n_length (int n_start_offset)
+{
+  // End of input is not malformed UTF-8: return quietly, as
+  // get_input_codepoint_length () does, rather than falling through to the
+  // "too long" diagnostic below.
+  int raw = peek_input (n_start_offset);
+  if (raw == EOF)
+    return 0;
+
+  uint8_t lead = static_cast<uint8_t> (raw);
+
+  // The lead byte determines how many bytes the sequence should have.
+  int expected_length;
+  if (lead < 128)
+    {
+      // ascii -- 1 byte
+      return 1;
+    }
+  else if ((lead & 0xC0) == 0x80)
+    {
+      // invalid (continuation; can't be first char)
+      return 0;
+    }
+  else if ((lead & 0xE0) == 0xC0)
+    expected_length = 2;
+  else if ((lead & 0xF0) == 0xE0)
+    expected_length = 3;
+  else if ((lead & 0xF8) == 0xF0)
+    expected_length = 4;
+  else
+    {
+      rust_error_at (get_current_location (),
+		     "invalid UTF-8 [THIRD] (too long)");
+      return 0;
+    }
+
+  // Every following byte must have the continuation pattern 10xxxxxx.
+  for (int i = 1; i < expected_length; i++)
+    {
+      uint8_t continuation
+	= static_cast<uint8_t> (peek_input (n_start_offset + i));
+      if ((continuation & 0xC0) != 0x80)
+	return 0;
+    }
+
+  return expected_length;
+}
+
+// Intended to peek the codepoint N codepoints ahead of the current one - try
+// not to use.  NOT IMPLEMENTED: the byte offset of that codepoint is summed
+// up, and then the function hits an always-failing assertion.
+Codepoint
+Lexer::test_peek_codepoint_input (int n)
+{
+  // Sum the encoded lengths of the N codepoints that precede the target.
+  // Note that this makes the lookup (at least) O(n), not O(1) like the other
+  // peek functions.
+  int byte_offset = 0;
+  for (int remaining = n; remaining > 0; remaining--)
+    byte_offset += test_get_input_codepoint_n_length (byte_offset);
+
+  // TODO: implement if still needed.  A finished version would decode the
+  // sequence found at byte_offset the same way peek_codepoint_input () decodes
+  // the one at offset 0.
+
+  // error out of function as it is not implemented
+  gcc_assert (1 == 0);
+  return {0};
+}
+
+// Replaces the current token with two tokens of kinds NEW_LEFT and NEW_RIGHT.
+// The left one takes the current token's location and the right one the
+// location one past it.
+void
+Lexer::split_current_token (TokenId new_left, TokenId new_right)
+{
+  /* TODO: assert that this TokenId is a "simple token" like punctuation and not
+   * like "IDENTIFIER"? */
+  const Location start = peek_token ()->get_locus ();
+
+  // the left half overwrites the current token
+  token_queue.replace_current_value (Token::make (new_left, start));
+  // the right half is inserted directly after it
+  token_queue.insert (1, Token::make (new_right, start + 1));
+}
+
+// Forwards the start of a new line to the line map; does nothing when the
+// lexer has no line map.
+void
+Lexer::start_line (int current_line, int current_column)
+{
+  if (line_map == nullptr)
+    return;
+
+  line_map->start_line (current_line, current_column);
+}
+
+} // namespace Rust
new file mode 100644
@@ -0,0 +1,271 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_LEX_H
+#define RUST_LEX_H
+
+#include "rust-linemap.h"
+#include "rust-buffered-queue.h"
+#include "rust-token.h"
+
+namespace Rust {
+/* Simple wrapper for FILE* that simplifies destruction: the stream is closed
+ * when the wrapper is destroyed, except for stdin, which is never closed.
+ * The filename pointer is stored as given (not copied), so the string it
+ * points to must outlive the wrapper.  Move-only; a moved-from wrapper holds
+ * no stream. */
+struct RAIIFile
+{
+private:
+  FILE *file;
+  const char *filename;
+
+  // Closes the stream unless there is none or it is stdin.
+  void close ()
+  {
+    if (file != nullptr && file != stdin)
+      fclose (file);
+  }
+
+public:
+  /**
+   * Open FILENAME for reading; "-" selects stdin.  A null FILENAME or a
+   * failed fopen leaves the wrapper without a stream - check with ok ().
+   */
+  RAIIFile (const char *filename) : file (nullptr), filename (filename)
+  {
+    // neither strcmp nor fopen accepts a null pointer
+    if (filename == nullptr)
+      return;
+
+    if (strcmp (filename, "-") == 0)
+      file = stdin;
+    else
+      file = fopen (filename, "r");
+  }
+
+  /**
+   * Create a RAIIFile from an existing instance of FILE*
+   */
+  RAIIFile (FILE *raw, const char *filename = nullptr)
+    : file (raw), filename (filename)
+  {}
+
+  RAIIFile (const RAIIFile &other) = delete;
+  RAIIFile &operator= (const RAIIFile &other) = delete;
+
+  // have to specify setting file to nullptr, otherwise unintended fclose occurs
+  RAIIFile (RAIIFile &&other) noexcept
+    : file (other.file), filename (other.filename)
+  {
+    other.file = nullptr;
+  }
+
+  RAIIFile &operator= (RAIIFile &&other) noexcept
+  {
+    // Self-assignment must be a no-op; otherwise the stream would be closed
+    // and the handle then reset to nullptr.
+    if (this == &other)
+      return *this;
+
+    close ();
+    file = other.file;
+    filename = other.filename;
+    other.file = nullptr;
+
+    return *this;
+  }
+
+  // A wrapper with neither stream nor filename; ok () is false for it.
+  static RAIIFile create_error () { return RAIIFile (nullptr, nullptr); }
+
+  ~RAIIFile () { close (); }
+
+  FILE *get_raw () const { return file; }
+  // May be nullptr (see create_error and the FILE* constructor's default).
+  const char *get_filename () const { return filename; }
+
+  // True when a stream is held.
+  bool ok () const { return file; }
+};
+
+/* Turns a stream of input characters into Tokens.  Characters come from an
+ * InputSource (a FILE* or an in-memory string) through a buffered queue, and
+ * tokens are built on demand by build_token () and handed out through a second
+ * buffered queue (peek_token / skip_token). */
+class Lexer
+{
+private:
+  // Request new Location for current column in line_table
+  Location get_current_location ();
+
+  // Skips the current input char.
+  void skip_input ();
+  // Advances current input char to n + 1 chars ahead of current position.
+  void skip_input (int n);
+
+  // Peeks the current char.
+  int peek_input ();
+  // Returns char n chars ahead of current position.
+  int peek_input (int n);
+
+  // Classifies keyword (i.e. gets id for keyword).
+  TokenId classify_keyword (const std::string &str);
+
+  // Builds a token from the input queue.
+  TokenPtr build_token ();
+
+  std::tuple<std::string, int, bool> parse_in_decimal ();
+  std::pair<std::string, int> parse_in_exponent_part ();
+  std::pair<PrimitiveCoreType, int> parse_in_type_suffix ();
+  std::tuple<char, int, bool> parse_escape (char opening_char);
+  std::tuple<Codepoint, int, bool> parse_utf8_escape (char opening_char);
+  int parse_partial_string_continue ();
+  std::pair<long, int> parse_partial_hex_escape ();
+  std::pair<Codepoint, int> parse_partial_unicode_escape ();
+
+  int get_input_codepoint_length ();
+  int test_get_input_codepoint_n_length (int n_start_offset);
+  Codepoint peek_codepoint_input ();
+  Codepoint test_peek_codepoint_input (int n);
+  void skip_codepoint_input ();
+  void skip_broken_string_input (int current_char);
+
+  TokenPtr parse_byte_char (Location loc);
+  TokenPtr parse_byte_string (Location loc);
+  TokenPtr parse_raw_byte_string (Location loc);
+  TokenPtr parse_raw_identifier (Location loc);
+  TokenPtr parse_string (Location loc);
+  TokenPtr maybe_parse_raw_string (Location loc);
+  TokenPtr parse_raw_string (Location loc, int initial_hash_count);
+  TokenPtr parse_non_decimal_int_literals (Location loc);
+  TokenPtr parse_decimal_int_or_float (Location loc);
+  TokenPtr parse_char_or_lifetime (Location loc);
+  TokenPtr parse_identifier_or_keyword (Location loc);
+
+  template <typename IsDigitFunc>
+  TokenPtr parse_non_decimal_int_literal (Location loc,
+					  IsDigitFunc is_digit_func,
+					  std::string existent_str, int base);
+
+public:
+  // Construct lexer with input file and filename provided
+  Lexer (const char *filename, RAIIFile input, Linemap *linemap);
+
+  // Lex the contents of a string instead of a file
+  Lexer (const std::string &input);
+
+  // dtor
+  ~Lexer ();
+
+  // don't allow copy semantics (for now, at least)
+  Lexer (const Lexer &other) = delete;
+  Lexer &operator= (const Lexer &other) = delete;
+
+  // enable move semantics
+  Lexer (Lexer &&other) = default;
+  Lexer &operator= (Lexer &&other) = default;
+
+  // Returns token n tokens ahead of current position.
+  const_TokenPtr peek_token (int n) { return token_queue.peek (n); }
+  // Peeks the current token.
+  const_TokenPtr peek_token () { return peek_token (0); }
+
+  // Advances current token to n + 1 tokens ahead of current position.
+  void skip_token (int n) { token_queue.skip (n); }
+  // Skips the current token.
+  void skip_token () { skip_token (0); }
+
+  // Replaces the current token with a specified token.
+  void replace_current_token (TokenPtr replacement);
+  // FIXME: don't use anymore
+
+  /* Splits the current token into two. Intended for use with nested generics
+   * closes (i.e. T<U<X>> where >> is wrongly lexed as one token). Note that
+   * this will only work with "simple" tokens like punctuation. */
+  void split_current_token (TokenId new_left, TokenId new_right);
+
+  Linemap *get_line_map () { return line_map; }
+
+  // Name of the input file, or an empty string when the input has no filename
+  // (a RAIIFile may carry a null filename, and constructing a std::string
+  // from a null pointer is undefined behaviour).
+  std::string get_filename ()
+  {
+    const char *name = input.get_filename ();
+    return name != nullptr ? std::string (name) : std::string ();
+  }
+
+private:
+  void start_line (int current_line, int current_column);
+
+  // File for use as input.
+  RAIIFile input;
+  // TODO is this actually required? could just have file storage in InputSource
+
+  // Current line number.
+  int current_line;
+  // Current column number.
+  int current_column;
+  // Current character.
+  int current_char;
+  // Line map.
+  Linemap *line_map;
+
+  /* Max column number that can be quickly allocated - higher may require
+   * allocating new linemap */
+  static const int max_column_hint = 80;
+
+  // Input source wrapper thing.
+  class InputSource
+  {
+  public:
+    virtual ~InputSource () {}
+
+    // Returns the next char from the input stream, or EOF once exhausted.
+    virtual int next () = 0;
+  };
+
+  class FileInputSource : public InputSource
+  {
+  private:
+    // Input source file.
+    FILE *input;
+
+  public:
+    // Create new input source from file.
+    FileInputSource (FILE *input) : input (input) {}
+
+    int next () override { return fgetc (input); }
+  };
+
+  class BufferInputSource : public InputSource
+  {
+  private:
+    // Only a reference is kept: the string must outlive this source.
+    const std::string &buffer;
+    size_t offs;
+
+  public:
+    // Create new input source from a string, starting at the given offset.
+    BufferInputSource (const std::string &b, size_t offset)
+      : buffer (b), offs (offset)
+    {}
+
+    int next () override
+    {
+      if (offs >= buffer.size ())
+	return EOF;
+
+      // Go through unsigned char, as fgetc does.  A plain (possibly signed)
+      // char would sign-extend bytes >= 0x80, and byte 0xFF would compare
+      // equal to EOF.
+      return static_cast<unsigned char> (buffer.at (offs++));
+    }
+  };
+
+  // The input source for the lexer.
+  // InputSource input_source;
+  // Input file queue.
+  std::unique_ptr<InputSource> raw_input_source;
+  buffered_queue<int, InputSource &> input_queue;
+
+  // Token source wrapper thing.
+  struct TokenSource
+  {
+    // The lexer object that will use this TokenSource.
+    Lexer *lexer;
+
+    // Create a new TokenSource with given lexer.
+    TokenSource (Lexer *parLexer) : lexer (parLexer) {}
+
+    // Builds the next token by calling back into the lexer.
+    TokenPtr next () { return lexer->build_token (); }
+  };
+
+  // The token source for the lexer.
+  // TokenSource token_source;
+  // Token stream queue.
+  buffered_queue<std::shared_ptr<Token>, TokenSource> token_queue;
+};
+
+} // namespace Rust
+
+#endif
new file mode 100644
@@ -0,0 +1,135 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#include "rust-token.h"
+
+#include "rust-diagnostics.h" // for error_at
+
+namespace Rust {
+/* Returns the description text of a token id, i.e. the second argument given
+ * to RS_TOKEN / RS_TOKEN_KEYWORD for that id in RS_TOKEN_LIST.  The switch
+ * cases are generated from RS_TOKEN_LIST via x-macros (hacky); an id without
+ * an entry in the list reaches gcc_unreachable ().  */
+const char *
+get_token_description (TokenId id)
+{
+ switch (id)
+ {
+#define RS_TOKEN(name, descr) \
+ case name: \
+ return descr;
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+ RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Returns the name of the enumerator itself as a string (the stringified
+ * first argument of RS_TOKEN / RS_TOKEN_KEYWORD), not its description.  The
+ * switch cases are generated from RS_TOKEN_LIST via x-macros (hacky); an id
+ * without an entry in the list reaches gcc_unreachable ().  */
+const char *
+token_id_to_str (TokenId id)
+{
+ switch (id)
+ {
+#define RS_TOKEN(name, _) \
+ case name: \
+ return #name;
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+ RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+ default:
+ gcc_unreachable ();
+ }
+}
+
+// Returns the printable name of a primitive core type (e.g. for use in
+// diagnostics).  CORETYPE_UNKNOWN and any value not listed give "unknown".
+const char *
+get_type_hint_string (PrimitiveCoreType type)
+{
+  const char *text = "unknown";
+
+  switch (type)
+    {
+    case CORETYPE_BOOL:
+      text = "bool";
+      break;
+    case CORETYPE_CHAR:
+      text = "char";
+      break;
+    case CORETYPE_STR:
+      text = "str";
+      break;
+    // CORETYPE_INT has the same value as CORETYPE_ISIZE
+    case CORETYPE_ISIZE:
+      text = "isize";
+      break;
+    // CORETYPE_UINT has the same value as CORETYPE_USIZE
+    case CORETYPE_USIZE:
+      text = "usize";
+      break;
+    case CORETYPE_F32:
+      text = "f32";
+      break;
+    case CORETYPE_F64:
+      text = "f64";
+      break;
+    case CORETYPE_I8:
+      text = "i8";
+      break;
+    case CORETYPE_I16:
+      text = "i16";
+      break;
+    case CORETYPE_I32:
+      text = "i32";
+      break;
+    case CORETYPE_I64:
+      text = "i64";
+      break;
+    case CORETYPE_I128:
+      text = "i128";
+      break;
+    case CORETYPE_U8:
+      text = "u8";
+      break;
+    case CORETYPE_U16:
+      text = "u16";
+      break;
+    case CORETYPE_U32:
+      text = "u32";
+      break;
+    case CORETYPE_U64:
+      text = "u64";
+      break;
+    case CORETYPE_U128:
+      text = "u128";
+      break;
+    case CORETYPE_PURE_DECIMAL:
+      text = "pure_decimal";
+      break;
+    case CORETYPE_UNKNOWN:
+    default:
+      break;
+    }
+
+  return text;
+}
+
+// Printable name of this token's type hint.
+const char *
+Token::get_type_hint_str () const
+{
+  const char *hint_name = get_type_hint_string (type_hint);
+  return hint_name;
+}
+
+// Returns the text attached to this token.  For a token that has no text an
+// error is reported and a reference to a shared empty string is returned.
+const std::string &
+Token::get_str () const
+{
+  if (str != NULL)
+    return *str;
+
+  // FIXME: attempt to return null again (this used to be an assertion that
+  // str is not NULL)
+
+  // HACK: allow referencing an empty string
+  static const std::string empty = "";
+
+  rust_error_at (get_locus (),
+		 "attempted to get string for %<%s%>, which has no string. "
+		 "returning empty string instead",
+		 get_token_description ());
+  return empty;
+}
+} // namespace Rust
new file mode 100644
@@ -0,0 +1,455 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_TOKEN_H
+#define RUST_TOKEN_H
+
+#include "rust-linemap.h"
+#include "rust-codepoint.h"
+
+// order: config, system, coretypes, input
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "input.h"
+
+namespace Rust {
+// "Primitive core types" in Rust - the different int and float types, as well
+// as some others. Tokens carry one of these as their type hint.
+enum PrimitiveCoreType
+{
+ CORETYPE_UNKNOWN,
+ // named primitives
+ CORETYPE_BOOL,
+ CORETYPE_CHAR,
+ CORETYPE_STR,
+ // technically int and uint are arch-dependent (pointer size); see aliases
+ CORETYPE_INT,
+ CORETYPE_UINT,
+ // numbered number primitives
+ CORETYPE_F32,
+ CORETYPE_F64,
+ CORETYPE_I8,
+ CORETYPE_I16,
+ CORETYPE_I32,
+ CORETYPE_I64,
+ CORETYPE_I128,
+ CORETYPE_U8,
+ CORETYPE_U16,
+ CORETYPE_U32,
+ CORETYPE_U64,
+ CORETYPE_U128,
+ // Pure decimals are used for tuple index.
+ // Also means there is no type hint.
+ CORETYPE_PURE_DECIMAL,
+ // arch-dependent pointer sizes: aliases with the same values as INT / UINT
+ CORETYPE_ISIZE = CORETYPE_INT,
+ CORETYPE_USIZE = CORETYPE_UINT
+};
+
+// RS_TOKEN(name, description): a non-keyword token and its description text
+// RS_TOKEN_KEYWORD(name, identifier): a keyword token and its source spelling
+//
+// Keep RS_TOKEN_KEYWORD sorted
+
+/* X-macro list of all token kinds: define RS_TOKEN and RS_TOKEN_KEYWORD,
+ * expand RS_TOKEN_LIST, then #undef both again (as enum TokenId does).
+ * Note that abstract, async, become, box, do, final, macro, override, priv,
+ * try, typeof, unsized, virtual, and yield are unused */
+#define RS_TOKEN_LIST \
+ RS_TOKEN (FIRST_TOKEN, "<first-token-marker>") \
+ RS_TOKEN (END_OF_FILE, "end of file") \
+ RS_TOKEN (EXCLAM, "!") \
+ RS_TOKEN (NOT_EQUAL, "!=") \
+ RS_TOKEN (PERCENT, "%") \
+ RS_TOKEN (PERCENT_EQ, "%=") \
+ RS_TOKEN (AMP, "&") \
+ RS_TOKEN (AMP_EQ, "&=") \
+ RS_TOKEN (LOGICAL_AND, "&&") \
+ RS_TOKEN (ASTERISK, "*") \
+ RS_TOKEN (ASTERISK_EQ, "*=") \
+ RS_TOKEN (PLUS, "+") \
+ RS_TOKEN (PLUS_EQ, "+=") \
+ RS_TOKEN (COMMA, ",") \
+ RS_TOKEN (MINUS, "-") \
+ RS_TOKEN (MINUS_EQ, "-=") \
+ RS_TOKEN (RETURN_TYPE, "->") \
+ RS_TOKEN (DOT, ".") \
+ RS_TOKEN (DOT_DOT, "..") \
+ RS_TOKEN (DOT_DOT_EQ, "..=") \
+ RS_TOKEN (ELLIPSIS, "...") \
+ RS_TOKEN (DIV, "/") \
+ RS_TOKEN (DIV_EQ, "/=") \
+ RS_TOKEN (COLON, ":") \
+ RS_TOKEN (SEMICOLON, ";") \
+ RS_TOKEN (LEFT_SHIFT, "<<") \
+ RS_TOKEN (LEFT_SHIFT_EQ, "<<=") \
+ RS_TOKEN (LEFT_ANGLE, "<") \
+ RS_TOKEN (LESS_OR_EQUAL, "<=") \
+ RS_TOKEN (EQUAL, "=") \
+ RS_TOKEN (EQUAL_EQUAL, "==") \
+ RS_TOKEN (MATCH_ARROW, "=>") \
+ RS_TOKEN (RIGHT_ANGLE, ">") \
+ RS_TOKEN (GREATER_OR_EQUAL, ">=") \
+ RS_TOKEN (RIGHT_SHIFT, ">>") \
+ RS_TOKEN (RIGHT_SHIFT_EQ, ">>=") \
+ RS_TOKEN (PATTERN_BIND, "@") \
+ RS_TOKEN (TILDE, "~") \
+ RS_TOKEN (BACKSLASH, "\\") \
+ RS_TOKEN (BACKTICK, "`") \
+ RS_TOKEN (CARET, "^") \
+ RS_TOKEN (CARET_EQ, "^=") \
+ RS_TOKEN (PIPE, "|") \
+ RS_TOKEN (PIPE_EQ, "|=") \
+ RS_TOKEN (OR, "||") \
+ RS_TOKEN (QUESTION_MARK, "?") \
+ RS_TOKEN (HASH, "#") \
+ /* from here on, dodgy and may not be correct. not operators and may be \
+ * symbols */ \
+ /* RS_TOKEN(SPACE, " ") probably too dodgy */ \
+ /* RS_TOKEN(NEWLINE, "\n")*/ \
+ RS_TOKEN (SCOPE_RESOLUTION, "::") /* dodgy */ \
+ RS_TOKEN (SINGLE_QUOTE, "'") /* should i differentiate from lifetime? */ \
+ RS_TOKEN (DOUBLE_QUOTE, "\"") \
+ RS_TOKEN (UNDERSCORE, \
+ "_") /* TODO: treat as reserved word like mrustc instead? */ \
+ RS_TOKEN (IDENTIFIER, "identifier") \
+ RS_TOKEN (INT_LITERAL, \
+ "integer literal") /* do different int and float types need \
+ different literal types? */ \
+ RS_TOKEN (FLOAT_LITERAL, "float literal") \
+ RS_TOKEN (STRING_LITERAL, "string literal") \
+ RS_TOKEN (CHAR_LITERAL, "character literal") \
+ RS_TOKEN (BYTE_STRING_LITERAL, "byte string literal") \
+ RS_TOKEN (BYTE_CHAR_LITERAL, "byte character literal") \
+ RS_TOKEN (LIFETIME, "lifetime") /* TODO: improve token type */ \
+ /* Have "interpolated" tokens (whatever that means)? identifier, path, type, \
+ * pattern, */ \
+ /* expression, statement, block, meta, item in mrustc (but not directly in \
+ * lexer). */ \
+ RS_TOKEN (LEFT_PAREN, "(") \
+ RS_TOKEN (RIGHT_PAREN, ")") \
+ RS_TOKEN (LEFT_CURLY, "{") \
+ RS_TOKEN (RIGHT_CURLY, "}") \
+ RS_TOKEN (LEFT_SQUARE, "[") \
+ RS_TOKEN (RIGHT_SQUARE, "]") \
+ /* Macros */ \
+ RS_TOKEN (DOLLAR_SIGN, "$") \
+ /* Doc Comments */ \
+ RS_TOKEN (INNER_DOC_COMMENT, "#![doc]") \
+ RS_TOKEN (OUTER_DOC_COMMENT, "#[doc]") \
+ /* have "weak" union and 'static keywords? */ \
+ \
+ RS_TOKEN_KEYWORD (ABSTRACT, "abstract") /* unused */ \
+ RS_TOKEN_KEYWORD (AS, "as") \
+ RS_TOKEN_KEYWORD (ASYNC, "async") /* unused */ \
+ RS_TOKEN_KEYWORD (BECOME, "become") /* unused */ \
+ RS_TOKEN_KEYWORD (BOX, "box") /* unused */ \
+ RS_TOKEN_KEYWORD (BREAK, "break") \
+ RS_TOKEN_KEYWORD (CONST, "const") \
+ RS_TOKEN_KEYWORD (CONTINUE, "continue") \
+ RS_TOKEN_KEYWORD (CRATE, "crate") \
+ /* FIXME: Do we need to add $crate (DOLLAR_CRATE) as a reserved kw? */ \
+ RS_TOKEN_KEYWORD (DO, "do") /* unused */ \
+ RS_TOKEN_KEYWORD (DYN, "dyn") \
+ RS_TOKEN_KEYWORD (ELSE, "else") \
+ RS_TOKEN_KEYWORD (ENUM_TOK, "enum") \
+ RS_TOKEN_KEYWORD (EXTERN_TOK, "extern") \
+ RS_TOKEN_KEYWORD (FALSE_LITERAL, "false") \
+ RS_TOKEN_KEYWORD (FINAL_TOK, "final") /* unused */ \
+ RS_TOKEN_KEYWORD (FN_TOK, "fn") \
+ RS_TOKEN_KEYWORD (FOR, "for") \
+ RS_TOKEN_KEYWORD (IF, "if") \
+ RS_TOKEN_KEYWORD (IMPL, "impl") \
+ RS_TOKEN_KEYWORD (IN, "in") \
+ RS_TOKEN_KEYWORD (LET, "let") \
+ RS_TOKEN_KEYWORD (LOOP, "loop") \
+ RS_TOKEN_KEYWORD (MACRO, "macro") /* unused */ \
+ RS_TOKEN_KEYWORD (MATCH_TOK, "match") \
+ RS_TOKEN_KEYWORD (MOD, "mod") \
+ RS_TOKEN_KEYWORD (MOVE, "move") \
+ RS_TOKEN_KEYWORD (MUT, "mut") \
+ RS_TOKEN_KEYWORD (OVERRIDE_TOK, "override") /* unused */ \
+ RS_TOKEN_KEYWORD (PRIV, "priv") /* unused */ \
+ RS_TOKEN_KEYWORD (PUB, "pub") \
+ RS_TOKEN_KEYWORD (REF, "ref") \
+ RS_TOKEN_KEYWORD (RETURN_TOK, "return") \
+ RS_TOKEN_KEYWORD (SELF_ALIAS, \
+ "Self") /* mrustc does not treat this as a reserved word*/ \
+ RS_TOKEN_KEYWORD (SELF, "self") \
+ RS_TOKEN_KEYWORD (STATIC_TOK, "static") \
+ RS_TOKEN_KEYWORD (STRUCT_TOK, "struct") \
+ RS_TOKEN_KEYWORD (SUPER, "super") \
+ RS_TOKEN_KEYWORD (TRAIT, "trait") \
+ RS_TOKEN_KEYWORD (TRUE_LITERAL, "true") \
+ RS_TOKEN_KEYWORD (TRY, "try") /* unused */ \
+ RS_TOKEN_KEYWORD (TYPE, "type") \
+ RS_TOKEN_KEYWORD (TYPEOF, "typeof") /* unused */ \
+ RS_TOKEN_KEYWORD (UNSAFE, "unsafe") \
+ RS_TOKEN_KEYWORD (UNSIZED, "unsized") /* unused */ \
+ RS_TOKEN_KEYWORD (USE, "use") \
+ RS_TOKEN_KEYWORD (VIRTUAL, "virtual") /* unused */ \
+ RS_TOKEN_KEYWORD (WHERE, "where") \
+ RS_TOKEN_KEYWORD (WHILE, "while") \
+ RS_TOKEN_KEYWORD (YIELD, "yield") /* unused */ \
+ \
+ RS_TOKEN (LAST_TOKEN, "<last-token-marker>")
+
+/* Contains all token types: one enumerator per RS_TOKEN_LIST entry, in list
+ * order, from FIRST_TOKEN to LAST_TOKEN.  Crappy implementation via x-macros:
+ * both entry macros expand to just the entry's name.  */
+enum TokenId
+{
+#define RS_TOKEN(name, _) name,
+#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y)
+ RS_TOKEN_LIST
+#undef RS_TOKEN_KEYWORD
+#undef RS_TOKEN
+};
+
+// Token is only defined further down; declare it here so that the pointer
+// aliases can already name it.
+class Token;
+// Shared-ownership smart pointer (shared_ptr) to a Token.
+using TokenPtr = std::shared_ptr<Token>;
+// Shared-ownership smart pointer (shared_ptr) to a constant Token.
+using const_TokenPtr = std::shared_ptr<const Token>;
+
+// Description text registered for a token id in RS_TOKEN_LIST (generated via
+// x-macros).
+const char *get_token_description (TokenId id);
+
+// Name of the token id's enumerator itself, as a string (generated via
+// x-macros).
+const char *token_id_to_str (TokenId id);
+
+// Printable name of a primitive core type; "unknown" when it has none.
+const char *get_type_hint_string (PrimitiveCoreType type);
+
+// Represents a single token. Create using factory static methods.
+//
+// A token stores its kind (TokenId), its source location, an optional owned
+// string (null for tokens created without text) and a type hint, which stays
+// CORETYPE_UNKNOWN unless a factory passes one explicitly. All constructors
+// are private; instances are obtained as TokenPtr through the static make*
+// functions. Tokens can be moved but not copied.
+class Token
+{
+private:
+ // Token kind.
+ TokenId token_id;
+ // Token location.
+ Location locus;
+ // Associated text (if any) of token. Null when the token has no text.
+ std::unique_ptr<std::string> str;
+ // TODO: maybe remove issues and just store std::string as value?
+ /* Type hint for token based on lexer data (e.g. type suffix). Does not exist
+ * for most tokens. */
+ PrimitiveCoreType type_hint;
+
+ // Token constructor from token id and location. Has a null string.
+ Token (TokenId token_id, Location location)
+ : token_id (token_id), locus (location), str (nullptr),
+ type_hint (CORETYPE_UNKNOWN)
+ {}
+
+ /* Token constructor from token id, location, and a string. The string is
+ * taken by rvalue reference and moved into the token's own storage. */
+ Token (TokenId token_id, Location location, std::string &&paramStr)
+ : token_id (token_id), locus (location),
+ str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN)
+ {}
+
+ /* Token constructor from token id, location, and a char. The stored string
+ * consists of that single char. */
+ Token (TokenId token_id, Location location, char paramChar)
+ : token_id (token_id), locus (location),
+ str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN)
+ {}
+
+ /* Token constructor from token id, location, and a "codepoint". The stored
+ * string is whatever Codepoint::as_string returns for it. */
+ Token (TokenId token_id, Location location, Codepoint paramCodepoint)
+ : token_id (token_id), locus (location),
+ str (new std::string (paramCodepoint.as_string ())),
+ type_hint (CORETYPE_UNKNOWN)
+ {}
+
+ /* Token constructor from token id, location, a string, and type hint. The
+ * string is taken by rvalue reference and moved into the token's own
+ * storage. */
+ Token (TokenId token_id, Location location, std::string &&paramStr,
+ PrimitiveCoreType parType)
+ : token_id (token_id), locus (location),
+ str (new std::string (std::move (paramStr))), type_hint (parType)
+ {}
+
+public:
+ // No default constructor.
+ Token () = delete;
+ // Do not copy/assign tokens.
+ Token (const Token &) = delete;
+ Token &operator= (const Token &) = delete;
+
+ // Allow moving tokens.
+ Token (Token &&other) = default;
+ Token &operator= (Token &&other) = default;
+
+ ~Token () = default;
+
+ /* TODO: make_shared (which saves a heap allocation) does not work with the
+ * private constructor */
+
+ // Makes and returns a new TokenPtr (with null string).
+ static TokenPtr make (TokenId token_id, Location locus)
+ {
+ // return std::make_shared<Token> (token_id, locus);
+ return TokenPtr (new Token (token_id, locus));
+ }
+
+ // Makes and returns a new TokenPtr of type IDENTIFIER.
+ static TokenPtr make_identifier (Location locus, std::string &&str)
+ {
+ // return std::make_shared<Token> (IDENTIFIER, locus, str);
+ return TokenPtr (new Token (IDENTIFIER, locus, std::move (str)));
+ }
+
+ /* Makes and returns a new TokenPtr of type INT_LITERAL, with an optional
+ * type hint (CORETYPE_UNKNOWN when omitted). */
+ static TokenPtr make_int (Location locus, std::string &&str,
+ PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
+ {
+ // return std::make_shared<Token> (INT_LITERAL, locus, str, type_hint);
+ return TokenPtr (
+ new Token (INT_LITERAL, locus, std::move (str), type_hint));
+ }
+
+ /* Makes and returns a new TokenPtr of type FLOAT_LITERAL, with an optional
+ * type hint (CORETYPE_UNKNOWN when omitted). */
+ static TokenPtr make_float (Location locus, std::string &&str,
+ PrimitiveCoreType type_hint = CORETYPE_UNKNOWN)
+ {
+ // return std::make_shared<Token> (FLOAT_LITERAL, locus, str, type_hint);
+ return TokenPtr (
+ new Token (FLOAT_LITERAL, locus, std::move (str), type_hint));
+ }
+
+ /* Makes and returns a new TokenPtr of type STRING_LITERAL. The type hint is
+ * always CORETYPE_STR. */
+ static TokenPtr make_string (Location locus, std::string &&str)
+ {
+ // return std::make_shared<Token> (STRING_LITERAL, locus, str,
+ // CORETYPE_STR);
+ return TokenPtr (
+ new Token (STRING_LITERAL, locus, std::move (str), CORETYPE_STR));
+ }
+
+ // Makes and returns a new TokenPtr of type CHAR_LITERAL.
+ static TokenPtr make_char (Location locus, Codepoint char_lit)
+ {
+ // return std::make_shared<Token> (CHAR_LITERAL, locus, char_lit);
+ return TokenPtr (new Token (CHAR_LITERAL, locus, char_lit));
+ }
+
+ // Makes and returns a new TokenPtr of type BYTE_CHAR_LITERAL.
+ static TokenPtr make_byte_char (Location locus, char byte_char)
+ {
+ // return std::make_shared<Token> (BYTE_CHAR_LITERAL, locus, byte_char);
+ return TokenPtr (new Token (BYTE_CHAR_LITERAL, locus, byte_char));
+ }
+
+ // Makes and returns a new TokenPtr of type BYTE_STRING_LITERAL (fix).
+ static TokenPtr make_byte_string (Location locus, std::string &&str)
+ {
+ // return std::make_shared<Token> (BYTE_STRING_LITERAL, locus, str);
+ return TokenPtr (new Token (BYTE_STRING_LITERAL, locus, std::move (str)));
+ }
+
+ // Makes and returns a new TokenPtr of type INNER_DOC_COMMENT.
+ static TokenPtr make_inner_doc_comment (Location locus, std::string &&str)
+ {
+ return TokenPtr (new Token (INNER_DOC_COMMENT, locus, std::move (str)));
+ }
+
+ // Makes and returns a new TokenPtr of type OUTER_DOC_COMMENT.
+ static TokenPtr make_outer_doc_comment (Location locus, std::string &&str)
+ {
+ return TokenPtr (new Token (OUTER_DOC_COMMENT, locus, std::move (str)));
+ }
+
+ // Makes and returns a new TokenPtr of type LIFETIME.
+ static TokenPtr make_lifetime (Location locus, std::string &&str)
+ {
+ // return std::make_shared<Token> (LIFETIME, locus, str);
+ return TokenPtr (new Token (LIFETIME, locus, std::move (str)));
+ }
+
+ // Gets id of the token.
+ TokenId get_id () const { return token_id; }
+
+ // Gets location of the token.
+ Location get_locus () const { return locus; }
+
+ // Gets string description of the token. Declared here, defined out of line.
+ const std::string &
+ get_str () const; /*{
+// FIXME: put in header again when fix null problem
+//gcc_assert(str != nullptr);
+if (str == nullptr) {
+error_at(get_locus(), "attempted to get string for '%s', which has no string.
+returning empty string instead.", get_token_description()); return "";
+}
+return *str;
+}*/
+
+ /* Gets token's type hint info. A stored CORETYPE_PURE_DECIMAL is reported as
+ * CORETYPE_UNKNOWN; use is_pure_decimal () to query that case. */
+ PrimitiveCoreType get_type_hint () const
+ {
+ return type_hint == CORETYPE_PURE_DECIMAL ? CORETYPE_UNKNOWN : type_hint;
+ }
+
+ // diagnostics (error reporting)
+ const char *get_token_description () const
+ {
+ return Rust::get_token_description (token_id);
+ }
+
+ // debugging
+ const char *token_id_to_str () const
+ {
+ return Rust::token_id_to_str (token_id);
+ }
+
+ // debugging
+ const char *get_type_hint_str () const;
+
+ /* Returns whether the token is a literal of any type (int, float, char,
+ * string, byte char, byte string). */
+ bool is_literal () const
+ {
+ switch (token_id)
+ {
+ case INT_LITERAL:
+ case FLOAT_LITERAL:
+ case CHAR_LITERAL:
+ case STRING_LITERAL:
+ case BYTE_CHAR_LITERAL:
+ case BYTE_STRING_LITERAL:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /* Returns whether the token actually has a string (regardless of whether it
+ * should or not). */
+ bool has_str () const { return str != nullptr; }
+
+ /* Returns whether the token should have a string: literals, identifiers and
+ * lifetimes. */
+ bool should_have_str () const
+ {
+ return is_literal () || token_id == IDENTIFIER || token_id == LIFETIME;
+ }
+
+ // Returns whether the token is a pure decimal int literal
+ bool is_pure_decimal () const { return type_hint == CORETYPE_PURE_DECIMAL; }
+};
+} // namespace Rust
+
+#endif
new file mode 100644
@@ -0,0 +1,204 @@
+// Copyright (C) 2020-2022 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#ifndef RUST_BUFFERED_QUEUE_H
+#define RUST_BUFFERED_QUEUE_H
+
+#include "rust-system.h"
+
+namespace Rust {
+/* Buffered queue implementation. Items are of type T, queue source is of type
+ * Source. Note that this is owning of the source.
+ *
+ * The queued items live in buffer[start, end). Items are pulled lazily from
+ * the source with source.next () whenever a peek reaches past the end of that
+ * range. Slots outside the range hold default-constructed T values. */
+template <typename T, typename Source> class buffered_queue
+{
+public:
+ // Construct empty queue from Source src.
+ buffered_queue (Source src) : source (src), start (0), end (0), buffer () {}
+
+ /* disable copying (since source is probably non-copyable)
+ * TODO is this actually a good idea? If source is non-copyable, it would
+ * just delete the copy constructor anyway.*/
+ buffered_queue (const buffered_queue &other) = delete;
+ buffered_queue &operator= (const buffered_queue &other) = delete;
+
+ // enable moving
+ buffered_queue (buffered_queue &&other) = default;
+ buffered_queue &operator= (buffered_queue &&other) = default;
+
+ /* Returns token at position start + n (i.e. n tokens ahead), reading as many
+ * items from the source as needed to make that position available. n must
+ * not be negative. The item is returned by value and stays in the queue. */
+ T peek (int n)
+ {
+ // n should not be behind
+ rust_assert (n >= 0);
+
+ int num_queued_items = end - start;
+ int num_items_required = n + 1;
+
+ // if required items go past end of queue, add them to queue
+ if (num_items_required > num_queued_items)
+ {
+ int num_items_to_read = num_items_required - num_queued_items;
+
+ /* if queue length + extra items is larger than buffer size, expand
+ * buffer */
+ if (end + num_items_to_read > (int) buffer.size ())
+ {
+ /* New size is 1.5x of (current buffer size + number of items still
+ * to be read). */
+ int new_size = (buffer.size () + num_items_to_read);
+ new_size += (new_size >> 1);
+
+ // old method:
+ /*
+ // create new queue buffer with new size
+ std::vector<T> new_queue (new_size);
+ std::copy (buffer.begin () + start, buffer.begin () + end,
+ new_queue.begin ());
+ start = 0;
+ end = num_queued_items;
+ // TODO: would move be better here? optimisation for move with
+ // shared pointer?
+
+ // swap member buffer and new queue buffer
+ std::swap (buffer, new_queue);
+ */
+
+ // TODO: determine overhead of this approach vs copy. Should be
+ // lower.
+ /* Move the live range [start, end) to the front of a fresh vector,
+ * which also compacts the queue so that start becomes 0. */
+ std::vector<T> new_queue;
+ new_queue.reserve (new_size);
+ new_queue.insert (new_queue.begin (),
+ std::make_move_iterator (buffer.begin () + start),
+ std::make_move_iterator (buffer.begin () + end));
+ start = 0;
+ end = num_queued_items;
+ // fill up rest of vector with junk so that indexing can work
+ new_queue.insert (new_queue.begin () + end,
+ new_size - new_queue.size (), T ());
+
+ buffer = std::move (new_queue);
+ /* this should be best method - std::move(range) would have
+ * allocation problems; initial construction would require
+ * reallocation upon resizing */
+
+ // validate that buffer is large enough now
+ rust_assert (end + num_items_to_read <= (int) buffer.size ());
+ }
+
+ /* fill the slots past the original end with items pulled from the
+ * source via next () */
+ for (int i = 0; i < num_items_to_read; i++)
+ buffer[end + i] = source.next ();
+
+ // move end based on additional items added
+ end += num_items_to_read;
+ }
+
+ rust_assert (0 <= start);
+ rust_assert (start <= end);
+ rust_assert (end <= (int) buffer.size ());
+
+ rust_assert (start + n < end);
+
+ // return value at start + n in buffer
+ return buffer[start + n];
+ }
+
+ /* TODO: add faster peek current token to remove overhead of conditional
+ * branches? */
+
+ /* Advances start by n + 1, i.e. drops the current item and the n items after
+ * it. Missing items are first read from the source through peek. */
+ void skip (int n)
+ {
+ // Call peek to ensure requested n is actually in queue.
+ peek (n);
+
+ // Clear queue values from start to n (inclusive).
+ for (int i = 0; i < (n + 1); i++)
+ buffer[start + i] = T ();
+
+ // Move start forward by n + 1.
+ start += (n + 1);
+
+ // Ensure start is not impossible somehow
+ rust_assert (0 <= start);
+ rust_assert (start <= end);
+
+ // Compact buffer if empty
+ if (start == end)
+ start = end = 0;
+ }
+
+ /* Inserts element at front of the queue, i.e. before the item that peek (0)
+ * currently returns. Really dirty hack with terrible performance (every
+ * element at or after start is shifted), only use when really needed. */
+ void insert_at_front (T elem_to_insert)
+ {
+ // TODO: test as this may not work properly
+
+ /* Insert at the logical front (start), not at the physical front of the
+ * vector: after a skip, start can be non-zero and the slots before it only
+ * hold cleared values, so inserting at buffer.begin () would shift one of
+ * those cleared slots into the front position instead of the new element. */
+ buffer.insert (buffer.begin () + start, std::move (elem_to_insert));
+
+ /* Increase the end number since added element means all others have shifted
+ * one along */
+ end++;
+ }
+
+ /* Insert at arbitrary position (attempt). index is relative to start and
+ * must not be negative. */
+ void insert (int index, T elem_to_insert)
+ {
+ // TODO: test as this may not work properly
+
+ // n should not be behind
+ rust_assert (index >= 0);
+
+ // call peek to ensure that the items behind this (at least) are in queue
+ if (index >= 1)
+ peek (index - 1);
+ else
+ peek (index);
+
+ buffer.insert (buffer.begin () + start + index, std::move (elem_to_insert));
+
+ end++;
+ }
+
+ // Replaces the current value in the buffer. Total HACK.
+ void replace_current_value (T replacement)
+ {
+ // call peek to ensure value exists
+ peek (0);
+
+ buffer[start] = std::move (replacement);
+
+ // don't move start or end
+ }
+
+private:
+ // Source of tokens for queue.
+ Source source;
+
+ // Begin of range in buffer, inclusive.
+ int start;
+ // End of range in buffer, exclusive.
+ int end;
+
+ // Queue buffer.
+ std::vector<T> buffer;
+};
+} // namespace Rust
+
+#endif