Initial utf8view

2023-10-04 14:59:32 -04:00 · 2023-10-04 14:59:32 -04:00 · 3407b86171
commit 3407b86171
parent b537916c6b
2 changed files with 68 additions and 54 deletions
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@ -40,7 +40,6 @@ static bool is_newline(char32_t c)

 std::vector<Token> Tokenizer::tokenize()
 {
-    std::string buffer;
    std::vector<Token> tokens;

    utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end());
@ -48,84 +47,77 @@ std::vector<Token> Tokenizer::tokenize()

    for (; it != end; ++it) {
+        // Each iteration classifies the code point at `it` and, for everything
+        // except whitespace and unknown characters, appends one Token whose
+        // view refers to the matching stretch of m_source.
        if (is_alpha(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+            // Identifier: starts with an is_alpha character and continues
+            // while the next character is is_alpha_num or '-'.
+            auto begin = it;
+            while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+            // `it` is now one past the identifier, so the view is [begin, it).
+            tokens.push_back({ .type = TokenType::ident, .view { begin, it } });
+            // Step back so the loop's ++it lands on the first unconsumed character.
            --it;
-            tokens.push_back({ .type = TokenType::ident, .value = buffer });
-            buffer.clear();
        }
        else if (*it == '<') {
-            tokens.push_back({ .type = TokenType::lt });
+            tokens.push_back({ .type = TokenType::lt, .view { it } });
        }
        else if (*it == '>') {
-            tokens.push_back({ .type = TokenType::gt });
+            tokens.push_back({ .type = TokenType::gt, .view { it } });
        }
        else if (*it == '!') {
-            tokens.push_back({ .type = TokenType::exclaim });
+            tokens.push_back({ .type = TokenType::exclaim, .view { it } });
        }
        else if (*it == '/') {
-            tokens.push_back({ .type = TokenType::fslash });
+            tokens.push_back({ .type = TokenType::fslash, .view { it } });
        }
        else if (*it == '=') {
-            tokens.push_back({ .type = TokenType::eq });
+            tokens.push_back({ .type = TokenType::eq, .view { it } });
        }
        else if (*it == '-') {
-            tokens.push_back({ .type = TokenType::minus });
+            tokens.push_back({ .type = TokenType::minus, .view { it } });
        }
        else if (*it == ':') {
-            tokens.push_back({ .type = TokenType::colon });
+            tokens.push_back({ .type = TokenType::colon, .view { it } });
        }
        else if (*it == '#') {
-            tokens.push_back({ .type = TokenType::hash });
+            tokens.push_back({ .type = TokenType::hash, .view { it } });
        }
        else if (*it == ';') {
-            tokens.push_back({ .type = TokenType::semi });
+            tokens.push_back({ .type = TokenType::semi, .view { it } });
        }
        else if (is_num(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && is_num(*it));
+            // Number: a run of is_num characters, viewed as [begin, it).
+            auto begin = it;
+            while (++it != end && is_num(*it)) { }
+            tokens.push_back({ .type = TokenType::num, .view { begin, it } });
            --it;
-            tokens.push_back({ .type = TokenType::num, .value = buffer });
-            buffer.clear();
        }
        else if (*it == ',') {
-            tokens.push_back({ .type = TokenType::comma });
+            tokens.push_back({ .type = TokenType::comma, .view { it } });
        }
        else if (*it == '{') {
-            tokens.push_back({ .type = TokenType::left_curly });
+            tokens.push_back({ .type = TokenType::left_curly, .view { it } });
        }
        else if (*it == '}') {
-            tokens.push_back({ .type = TokenType::right_curly });
+            tokens.push_back({ .type = TokenType::right_curly, .view { it } });
        }
        else if (*it == '(') {
-            tokens.push_back({ .type = TokenType::left_paren });
+            tokens.push_back({ .type = TokenType::left_paren, .view { it } });
        }
        else if (*it == ')') {
-            tokens.push_back({ .type = TokenType::right_paren });
+            tokens.push_back({ .type = TokenType::right_paren, .view { it } });
        }
        else if (*it == '.') {
-            tokens.push_back({ .type = TokenType::dot });
+            tokens.push_back({ .type = TokenType::dot, .view { it } });
        }
        else if (*it == '@') {
-            tokens.push_back({ .type = TokenType::at });
+            tokens.push_back({ .type = TokenType::at, .view { it } });
        }
        else if (*it == '"') {
            ++it;
+            // Start the view after the opening quote so it holds the same text
+            // the removed buffer code collected: the characters between the quotes.
+            auto begin = it;
            while (it != end && *it != '"') {
-                utf8::append(*it, std::back_inserter(buffer));
                ++it;
            }
-            tokens.push_back({ .type = TokenType::str, .value = buffer });
-            buffer.clear();
+            tokens.push_back({ .type = TokenType::str, .view { begin, it } });
+            if (it == end) {
+                // Unterminated string: stop here, otherwise the loop's ++it
+                // would advance an iterator that is already at the end.
+                break;
+            }
        }
        else if (is_space(*it) || is_newline(*it)) {
        }
        else {
-            utf8::append(*it, std::back_inserter(buffer));
-            std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
-            buffer.clear();
+            std::cout << "[WARN] Unexpected token" << std::endl;
        }
    }

@ -140,13 +132,11 @@ std::string Token::to_string()
    case TokenType::gt:
        return ">";
    case TokenType::ident:
-        assert(value.has_value());
-        return value.value();
+        return "IDENT"; // TODO
    case TokenType::fslash:
        return "/";
    case TokenType::str:
-        assert(value.has_value());
-        return value.value();
+        return "STR"; // TODO
    case TokenType::exclaim:
        return "!";
    case TokenType::eq:
@ -174,8 +164,7 @@ std::string Token::to_string()
    case TokenType::semi:
        return ";";
    case TokenType::num:
-        assert(value.has_value());
-        return value.value();
+        return "NUM"; // TODO
    default:
        assert(false && "Unimplemented");
        return "";
@ -238,8 +227,7 @@ std::optional<NodeDocType> Parser::parse_doc_type()
        consume();
        consume();
        consume();
-        NodeDocType doc_type;
-        doc_type.type = consume().value.value();
+        NodeDocType doc_type { .type = consume().view };
        consume();
        return doc_type;
    }
@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
    return peek(ahead).value().get().type == type;
 }

-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
-    if (!peek_is(ahead, type)) {
-        return false;
-    }
-    return peek(ahead).value().get().value.has_value();
-}
-
 bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
 {
-    if (!peek_is_with_val(ahead, type)) {
+    if (!peek_is(ahead, type)) {
        return false;
    }
    switch (cmp) {
@ -351,3 +331,23 @@ std::optional<NodeAttr> Parser::parse_attr()
 }

 }
+
+/// Builds a view covering the single code point that `begin` points at.
+///
+/// The end iterator is a copy of `begin` advanced by one code point. It
+/// therefore shares `begin`'s underlying range, which utf8::iterator needs
+/// when two iterators are compared, and it is correct for code points that
+/// take more than one byte.
+///
+/// `begin` must point at a code point, i.e. it must not be the end of its range.
+Utf8View::Utf8View(Utf8View::Iterator begin)
+    : m_begin(begin)
+    , m_end(begin)
+{
+    ++m_end;
+}
+
+/// Builds a view from an explicit pair of iterators; both are stored as given.
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end)
+    : m_begin { begin }
+    , m_end { end }
+{
+}
+/// Returns a copy of the iterator stored as the start of the view.
+auto Utf8View::begin() const -> Iterator
+{
+    return m_begin;
+}
+/// Returns a copy of the iterator stored as the end of the view.
+auto Utf8View::end() const -> Iterator
+{
+    return m_end;
+}
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@ -8,6 +8,21 @@

 #include <utf8.h>

+/// A pair of UTF-8 iterators marking a stretch of text inside a std::string.
+///
+/// Only the two iterators are stored and no characters are copied, so the
+/// string the iterators came from must stay alive and unmodified for as long
+/// as the view is used.
+class Utf8View {
+public:
+    /// Code-point iterator over the bytes of a std::string.
+    using Iterator = utf8::iterator<std::string::iterator>;
+
+    /// Creates a view from a start iterator alone; the tokenizer uses this
+    /// form for its one-character tokens.
+    explicit Utf8View(Iterator begin);
+
+    /// Creates a view from an explicit start and end iterator.
+    Utf8View(Iterator begin, Iterator end);
+
+    /// Start of the view.
+    [[nodiscard]] auto begin() const -> Iterator;
+
+    /// End of the view.
+    [[nodiscard]] auto end() const -> Iterator;
+
+private:
+    Iterator m_begin;
+    Iterator m_end;
+};
+
 namespace html {

 enum class TokenType {
@ -34,7 +49,7 @@ enum class TokenType {

 struct Token { // One lexical token: its kind plus the source text it was scanned from.
    TokenType type; // Which kind of token this is.
-    std::optional<std::string> value = {};
+    Utf8View view; // Stretch of the tokenized source this token covers; iterators only, no copy of the text.

    std::string to_string(); // Printable form of the token, chosen by `type`.
 };
@ -50,7 +65,7 @@ private:
 };

 struct NodeDocType { // Value produced by Parser::parse_doc_type().
-    std::string type;
+    Utf8View type; // View of the token consumed as the doctype's type; it refers to the tokenized source rather than owning a copy.
 };

 struct NodeAttr {
@ -96,7 +111,6 @@ private:
    enum class StrCmp { case_sensitive, case_insensitive };

    bool peek_is(size_t ahead, TokenType type);
-    bool peek_is_with_val(size_t ahead, TokenType type);
    bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);

    Token& consume();