From 3407b86171db6eb70fddb7d42a8ce78af715ec9c Mon Sep 17 00:00:00 2001
From: orosmatthew <orosmatthew@pm.me>
Date: Wed, 4 Oct 2023 14:59:32 -0400
Subject: [PATCH] Initial utf8view

---
 src/html_parse.cpp | 102 ++++++++++++++++++++++-----------------------
 src/html_parse.hpp |  20 +++++++--
 2 files changed, 68 insertions(+), 54 deletions(-)
diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index ad1b4e4..3e2d691 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
 
 std::vector<Token> Tokenizer::tokenize()
 {
-    std::string buffer;
     std::vector<Token> tokens;
 
     utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end());
@@ -48,84 +47,77 @@ std::vector<Token> Tokenizer::tokenize()
 
     for (; it != end; ++it) {
         if (is_alpha(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+            auto begin = it; // first code point of the identifier
+            while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+            tokens.push_back({ .type = TokenType::ident, .view { begin, it } }); // append the token; `it` is one past the identifier here
             --it;
-            tokens.push_back({ .type = TokenType::ident, .value = buffer });
-            buffer.clear();
         }
         else if (*it == '<') {
-            tokens.push_back({ .type = TokenType::lt });
+            tokens.push_back({ .type = TokenType::lt, .view { it } });
         }
         else if (*it == '>') {
-            tokens.push_back({ .type = TokenType::gt });
+            tokens.push_back({ .type = TokenType::gt, .view { it } });
         }
         else if (*it == '!') {
-            tokens.push_back({ .type = TokenType::exclaim });
+            tokens.push_back({ .type = TokenType::exclaim, .view { it } });
         }
         else if (*it == '/') {
-            tokens.push_back({ .type = TokenType::fslash });
+            tokens.push_back({ .type = TokenType::fslash, .view { it } });
         }
         else if (*it == '=') {
-            tokens.push_back({ .type = TokenType::eq });
+            tokens.push_back({ .type = TokenType::eq, .view { it } });
         }
         else if (*it == '-') {
-            tokens.push_back({ .type = TokenType::minus });
+            tokens.push_back({ .type = TokenType::minus, .view { it } });
         }
         else if (*it == ':') {
-            tokens.push_back({ .type = TokenType::colon });
+            tokens.push_back({ .type = TokenType::colon, .view { it } });
         }
         else if (*it == '#') {
-            tokens.push_back({ .type = TokenType::hash });
+            tokens.push_back({ .type = TokenType::hash, .view { it } });
         }
         else if (*it == ';') {
-            tokens.push_back({ .type = TokenType::semi });
+            tokens.push_back({ .type = TokenType::semi, .view { it } });
         }
         else if (is_num(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && is_num(*it));
+            auto begin = it; // first digit of the number
+            while (++it != end && is_num(*it)) { }
+            tokens.push_back({ .type = TokenType::num, .view { begin, it } }); // `it` is one past the last digit here
             --it;
-            tokens.push_back({ .type = TokenType::num, .value = buffer });
-            buffer.clear();
         }
         else if (*it == ',') {
-            tokens.push_back({ .type = TokenType::comma });
+            tokens.push_back({ .type = TokenType::comma, .view { it } });
         }
         else if (*it == '{') {
-            tokens.push_back({ .type = TokenType::left_curly });
+            tokens.push_back({ .type = TokenType::left_curly, .view { it } });
         }
         else if (*it == '}') {
-            tokens.push_back({ .type = TokenType::right_curly });
+            tokens.push_back({ .type = TokenType::right_curly, .view { it } });
         }
         else if (*it == '(') {
-            tokens.push_back({ .type = TokenType::left_paren });
+            tokens.push_back({ .type = TokenType::left_paren, .view { it } });
         }
         else if (*it == ')') {
-            tokens.push_back({ .type = TokenType::right_paren });
+            tokens.push_back({ .type = TokenType::right_paren, .view { it } });
         }
         else if (*it == '.') {
-            tokens.push_back({ .type = TokenType::dot });
+            tokens.push_back({ .type = TokenType::dot, .view { it } });
         }
         else if (*it == '@') {
-            tokens.push_back({ .type = TokenType::at });
+            tokens.push_back({ .type = TokenType::at, .view { it } });
         }
         else if (*it == '"') {
+            auto begin = it;
             ++it;
             while (it != end && *it != '"') {
-                utf8::append(*it, std::back_inserter(buffer));
                 ++it;
             }
-            tokens.push_back({ .type = TokenType::str, .value = buffer });
-            buffer.clear();
+            tokens.push_back({ .type = TokenType::str, .view { begin, it } });
         }
         else if (is_space(*it) || is_newline(*it)) {
         }
         else {
-            utf8::append(*it, std::back_inserter(buffer));
-            std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
-            buffer.clear();
+            std::cout << "[WARN] Unexpected token" << std::endl;
         }
     }
 
@@ -140,13 +132,11 @@ std::string Token::to_string()
     case TokenType::gt:
         return ">";
     case TokenType::ident:
-        assert(value.has_value());
-        return value.value();
+        return "IDENT"; // TODO
     case TokenType::fslash:
         return "/";
     case TokenType::str:
-        assert(value.has_value());
-        return value.value();
+        return "STR"; // TODO
     case TokenType::exclaim:
         return "!";
     case TokenType::eq:
@@ -174,8 +164,7 @@ std::string Token::to_string()
     case TokenType::semi:
         return ";";
     case TokenType::num:
-        assert(value.has_value());
-        return value.value();
+        return "NUM"; // TODO
     default:
         assert(false && "Unimplemented");
         return "";
@@ -238,8 +227,7 @@ std::optional<NodeDocType> Parser::parse_doc_type()
         consume();
         consume();
         consume();
-        NodeDocType doc_type;
-        doc_type.type = consume().value.value();
+        NodeDocType doc_type { .type = consume().view };
         consume();
         return doc_type;
     }
@@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
     return peek(ahead).value().get().type == type;
 }
 
-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
-    if (!peek_is(ahead, type)) {
-        return false;
-    }
-    return peek(ahead).value().get().value.has_value();
-}
-
 bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
 {
-    if (!peek_is_with_val(ahead, type)) {
+    if (!peek_is(ahead, type)) {
         return false;
     }
     switch (cmp) {
@@ -350,4 +330,24 @@ std::optional<NodeAttr> Parser::parse_attr()
     }
 }
 
-}
\ No newline at end of file
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin) // single-code-point view; `begin` must not be at the end of its range
+    : m_begin(begin)
+    , m_end(++begin) // m_begin is declared first, so it copies `begin` before this increment; same range, one code point on
+{
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end) // stores both iterators as given; no validation is performed here
+    : m_begin(begin)
+    , m_end(end)
+{
+}
+Utf8View::Iterator Utf8View::begin() const // returns a copy of the stored start iterator
+{
+    return m_begin;
+}
+Utf8View::Iterator Utf8View::end() const // returns a copy of the stored end iterator
+{
+    return m_end;
+}
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index eb271d1..649de6c 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -8,6 +8,21 @@
 
 #include <utf8.h>
 
+class Utf8View { // pair of utf8 iterators (begin/end) into a std::string; holds no copy of the text
+public:
+    using Iterator = utf8::iterator<std::string::iterator>;
+
+    explicit Utf8View(Iterator begin); // single-iterator form; the end iterator is derived from `begin`
+    Utf8View(Iterator begin, Iterator end); // stores both iterators unchanged
+
+    [[nodiscard]] Iterator begin() const; // copy of the stored start iterator
+    [[nodiscard]] Iterator end() const; // copy of the stored end iterator
+
+private:
+    Iterator m_begin;
+    Iterator m_end;
+};
+
 namespace html {
 
 enum class TokenType {
@@ -34,7 +49,7 @@ enum class TokenType {
 
 struct Token {
     TokenType type;
-    std::optional<std::string> value = {};
+    Utf8View view;
 
     std::string to_string();
 };
@@ -50,7 +65,7 @@ private:
 };
 
 struct NodeDocType {
-    std::string type;
+    Utf8View type;
 };
 
 struct NodeAttr {
@@ -96,7 +111,6 @@ private:
     enum class StrCmp { case_sensitive, case_insensitive };
 
     bool peek_is(size_t ahead, TokenType type);
-    bool peek_is_with_val(size_t ahead, TokenType type);
     bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
 
     Token& consume();