From 3407b86171db6eb70fddb7d42a8ce78af715ec9c Mon Sep 17 00:00:00 2001
From: orosmatthew
Date: Wed, 4 Oct 2023 14:59:32 -0400
Subject: [PATCH] Initial utf8view

---
Review notes (text between the "---" line above and the first "diff --git"
line is commentary and is skipped when the patch is applied):

 * Tokenizer::tokenize(), identifier branch: the Token was built as a local
   and never stored, so every identifier was silently dropped. It is now
   pushed onto `tokens`, exactly like the number branch.
 * Tokenizer::tokenize(), string branch: the view now starts after the
   opening quote, so it covers the same characters the removed `.value`
   buffer held (neither quote). Revert that one-line move if the raw,
   quoted span is what callers want.
 * Tokenizer::tokenize(), number branch: removed the stray `;` after the
   empty scanning loop.
 * Utf8View(Iterator): the end iterator is now the start iterator advanced
   by one code point. It keeps the start iterator's range (utfcpp's checked
   iterator refuses to compare iterators built over different ranges) and
   no longer assumes the character is one byte long.
 * Left as-is: in the string branch an unterminated literal leaves `it` at
   `end`, after which the outer loop still executes `++it`;
   Token::to_string() returns placeholders for ident/str/num (marked TODO).
 * Template argument lists and #include targets are missing from this copy
   of the patch (e.g. `std::vector tokens;`); they are reproduced as found.
 * Every hunk keeps its original added/removed line counts, so the hunk
   headers and the diffstat below are still accurate; only the post-image
   blob id on the first "index" line is stale.

 src/html_parse.cpp | 102 ++++++++++++++++++++++-----------------------
 src/html_parse.hpp |  20 +++++++--
 2 files changed, 68 insertions(+), 54 deletions(-)

diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index ad1b4e4..3e2d691 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
 
 std::vector Tokenizer::tokenize()
 {
-    std::string buffer;
     std::vector tokens;
 
     utf8::iterator it(m_source.begin(), m_source.begin(), m_source.end());
@@ -48,84 +47,77 @@ std::vector Tokenizer::tokenize()
 
     for (; it != end; ++it) {
         if (is_alpha(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+            auto begin = it;
+            while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+            tokens.push_back({ .type = TokenType::ident, .view { begin, it } });
             --it;
-            tokens.push_back({ .type = TokenType::ident, .value = buffer });
-            buffer.clear();
         }
         else if (*it == '<') {
-            tokens.push_back({ .type = TokenType::lt });
+            tokens.push_back({ .type = TokenType::lt, .view { it } });
         }
         else if (*it == '>') {
-            tokens.push_back({ .type = TokenType::gt });
+            tokens.push_back({ .type = TokenType::gt, .view { it } });
         }
         else if (*it == '!') {
-            tokens.push_back({ .type = TokenType::exclaim });
+            tokens.push_back({ .type = TokenType::exclaim, .view { it } });
         }
         else if (*it == '/') {
-            tokens.push_back({ .type = TokenType::fslash });
+            tokens.push_back({ .type = TokenType::fslash, .view { it } });
         }
         else if (*it == '=') {
-            tokens.push_back({ .type = TokenType::eq });
+            tokens.push_back({ .type = TokenType::eq, .view { it } });
         }
         else if (*it == '-') {
-            tokens.push_back({ .type = TokenType::minus });
+            tokens.push_back({ .type = TokenType::minus, .view { it } });
         }
         else if (*it == ':') {
-            tokens.push_back({ .type = TokenType::colon });
+            tokens.push_back({ .type = TokenType::colon, .view { it } });
         }
         else if (*it == '#') {
-            tokens.push_back({ .type = TokenType::hash });
+            tokens.push_back({ .type = TokenType::hash, .view { it } });
         }
         else if (*it == ';') {
-            tokens.push_back({ .type = TokenType::semi });
+            tokens.push_back({ .type = TokenType::semi, .view { it } });
         }
         else if (is_num(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && is_num(*it));
+            auto begin = it;
+            while (++it != end && is_num(*it)) { }
+            tokens.push_back({ .type = TokenType::num, .view { begin, it } });
             --it;
-            tokens.push_back({ .type = TokenType::num, .value = buffer });
-            buffer.clear();
         }
         else if (*it == ',') {
-            tokens.push_back({ .type = TokenType::comma });
+            tokens.push_back({ .type = TokenType::comma, .view { it } });
         }
         else if (*it == '{') {
-            tokens.push_back({ .type = TokenType::left_curly });
+            tokens.push_back({ .type = TokenType::left_curly, .view { it } });
         }
         else if (*it == '}') {
-            tokens.push_back({ .type = TokenType::right_curly });
+            tokens.push_back({ .type = TokenType::right_curly, .view { it } });
         }
         else if (*it == '(') {
-            tokens.push_back({ .type = TokenType::left_paren });
+            tokens.push_back({ .type = TokenType::left_paren, .view { it } });
         }
         else if (*it == ')') {
-            tokens.push_back({ .type = TokenType::right_paren });
+            tokens.push_back({ .type = TokenType::right_paren, .view { it } });
         }
         else if (*it == '.') {
-            tokens.push_back({ .type = TokenType::dot });
+            tokens.push_back({ .type = TokenType::dot, .view { it } });
         }
         else if (*it == '@') {
-            tokens.push_back({ .type = TokenType::at });
+            tokens.push_back({ .type = TokenType::at, .view { it } });
         }
         else if (*it == '"') {
             ++it;
+            auto begin = it;
             while (it != end && *it != '"') {
-                utf8::append(*it, std::back_inserter(buffer));
                 ++it;
             }
-            tokens.push_back({ .type = TokenType::str, .value = buffer });
-            buffer.clear();
+            tokens.push_back({ .type = TokenType::str, .view { begin, it } });
         }
         else if (is_space(*it) || is_newline(*it)) {
         }
         else {
-            utf8::append(*it, std::back_inserter(buffer));
-            std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
-            buffer.clear();
+            std::cout << "[WARN] Unexpected token" << std::endl;
         }
     }
 
@@ -140,13 +132,11 @@ std::string Token::to_string()
     case TokenType::gt:
         return ">";
     case TokenType::ident:
-        assert(value.has_value());
-        return value.value();
+        return "IDENT"; // TODO
     case TokenType::fslash:
         return "/";
     case TokenType::str:
-        assert(value.has_value());
-        return value.value();
+        return "STR"; // TODO
     case TokenType::exclaim:
         return "!";
     case TokenType::eq:
@@ -174,8 +164,7 @@ std::string Token::to_string()
     case TokenType::semi:
         return ";";
     case TokenType::num:
-        assert(value.has_value());
-        return value.value();
+        return "NUM"; // TODO
     default:
         assert(false && "Unimplemented");
         return "";
@@ -238,8 +227,7 @@ std::optional Parser::parse_doc_type()
         consume();
         consume();
         consume();
-        NodeDocType doc_type;
-        doc_type.type = consume().value.value();
+        NodeDocType doc_type { .type = consume().view };
         consume();
         return doc_type;
     }
@@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
     return peek(ahead).value().get().type == type;
 }
 
-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
-    if (!peek_is(ahead, type)) {
-        return false;
-    }
-    return peek(ahead).value().get().value.has_value();
-}
-
 bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
 {
-    if (!peek_is_with_val(ahead, type)) {
+    if (!peek_is(ahead, type)) {
         return false;
     }
     switch (cmp) {
@@ -350,4 +330,24 @@ std::optional Parser::parse_attr()
     }
 }
 
-}
\ No newline at end of file
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin)
+    : m_begin(begin)
+    , m_end(++Iterator(begin))
+{
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end)
+    : m_begin(begin)
+    , m_end(end)
+{
+}
+Utf8View::Iterator Utf8View::begin() const
+{
+    return m_begin;
+}
+Utf8View::Iterator Utf8View::end() const
+{
+    return m_end;
+}
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index eb271d1..649de6c 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -8,6 +8,21 @@
 
 #include
 
+class Utf8View {
+public:
+    using Iterator = utf8::iterator;
+
+    explicit Utf8View(Iterator begin);
+    Utf8View(Iterator begin, Iterator end);
+
+    [[nodiscard]] Iterator begin() const;
+    [[nodiscard]] Iterator end() const;
+
+private:
+    Iterator m_begin;
+    Iterator m_end;
+};
+
 namespace html {
 
 enum class TokenType {
@@ -34,7 +49,7 @@ enum class TokenType {
 
 struct Token {
     TokenType type;
-    std::optional value = {};
+    Utf8View view;
 
     std::string to_string();
 };
@@ -50,7 +65,7 @@ private:
 };
 
 struct NodeDocType {
-    std::string type;
+    Utf8View type;
 };
 
 struct NodeAttr {
@@ -96,7 +111,6 @@ private:
     enum class StrCmp { case_sensitive, case_insensitive };
 
     bool peek_is(size_t ahead, TokenType type);
-    bool peek_is_with_val(size_t ahead, TokenType type);
     bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
 
     Token& consume();