Initial utf8view

orosmatthew 2023-10-04 14:59:32 -04:00
parent b537916c6b
commit 3407b86171
2 changed files with 68 additions and 54 deletions

View File

@@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
 std::vector<Token> Tokenizer::tokenize()
 {
-    std::string buffer;
     std::vector<Token> tokens;
     utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end());
@@ -48,84 +47,77 @@ std::vector<Token> Tokenizer::tokenize()
     for (; it != end; ++it) {
         if (is_alpha(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+            auto begin = it;
+            while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+            Token token { .type = TokenType::ident, .view { begin, it } };
             --it;
-            tokens.push_back({ .type = TokenType::ident, .value = buffer });
-            buffer.clear();
         }
         else if (*it == '<') {
-            tokens.push_back({ .type = TokenType::lt });
+            tokens.push_back({ .type = TokenType::lt, .view { it } });
         }
         else if (*it == '>') {
-            tokens.push_back({ .type = TokenType::gt });
+            tokens.push_back({ .type = TokenType::gt, .view { it } });
         }
         else if (*it == '!') {
-            tokens.push_back({ .type = TokenType::exclaim });
+            tokens.push_back({ .type = TokenType::exclaim, .view { it } });
        }
         else if (*it == '/') {
-            tokens.push_back({ .type = TokenType::fslash });
+            tokens.push_back({ .type = TokenType::fslash, .view { it } });
         }
         else if (*it == '=') {
-            tokens.push_back({ .type = TokenType::eq });
+            tokens.push_back({ .type = TokenType::eq, .view { it } });
         }
         else if (*it == '-') {
-            tokens.push_back({ .type = TokenType::minus });
+            tokens.push_back({ .type = TokenType::minus, .view { it } });
         }
         else if (*it == ':') {
-            tokens.push_back({ .type = TokenType::colon });
+            tokens.push_back({ .type = TokenType::colon, .view { it } });
         }
         else if (*it == '#') {
-            tokens.push_back({ .type = TokenType::hash });
+            tokens.push_back({ .type = TokenType::hash, .view { it } });
        }
         else if (*it == ';') {
-            tokens.push_back({ .type = TokenType::semi });
+            tokens.push_back({ .type = TokenType::semi, .view { it } });
         }
         else if (is_num(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && is_num(*it));
+            auto begin = it;
+            while (++it != end && is_num(*it)) { };
+            tokens.push_back({ .type = TokenType::num, .view { begin, it } });
             --it;
-            tokens.push_back({ .type = TokenType::num, .value = buffer });
-            buffer.clear();
         }
         else if (*it == ',') {
-            tokens.push_back({ .type = TokenType::comma });
+            tokens.push_back({ .type = TokenType::comma, .view { it } });
         }
         else if (*it == '{') {
-            tokens.push_back({ .type = TokenType::left_curly });
+            tokens.push_back({ .type = TokenType::left_curly, .view { it } });
        }
         else if (*it == '}') {
-            tokens.push_back({ .type = TokenType::right_curly });
+            tokens.push_back({ .type = TokenType::right_curly, .view { it } });
         }
         else if (*it == '(') {
-            tokens.push_back({ .type = TokenType::left_paren });
+            tokens.push_back({ .type = TokenType::left_paren, .view { it } });
         }
         else if (*it == ')') {
-            tokens.push_back({ .type = TokenType::right_paren });
+            tokens.push_back({ .type = TokenType::right_paren, .view { it } });
         }
         else if (*it == '.') {
-            tokens.push_back({ .type = TokenType::dot });
+            tokens.push_back({ .type = TokenType::dot, .view { it } });
         }
         else if (*it == '@') {
-            tokens.push_back({ .type = TokenType::at });
+            tokens.push_back({ .type = TokenType::at, .view { it } });
         }
         else if (*it == '"') {
+            auto begin = it;
             ++it;
             while (it != end && *it != '"') {
-                utf8::append(*it, std::back_inserter(buffer));
                 ++it;
             }
-            tokens.push_back({ .type = TokenType::str, .value = buffer });
-            buffer.clear();
+            tokens.push_back({ .type = TokenType::str, .view { begin, it } });
         }
         else if (is_space(*it) || is_newline(*it)) {
         }
         else {
-            utf8::append(*it, std::back_inserter(buffer));
-            std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
-            buffer.clear();
+            std::cout << "[WARN] Unexpected token" << std::endl;
         }
     }
@@ -140,13 +132,11 @@ std::string Token::to_string()
     case TokenType::gt:
         return ">";
     case TokenType::ident:
-        assert(value.has_value());
-        return value.value();
+        return "IDENT"; // TODO
     case TokenType::fslash:
         return "/";
     case TokenType::str:
-        assert(value.has_value());
-        return value.value();
+        return "STR"; // TODO
     case TokenType::exclaim:
         return "!";
     case TokenType::eq:
@@ -174,8 +164,7 @@ std::string Token::to_string()
     case TokenType::semi:
         return ";";
     case TokenType::num:
-        assert(value.has_value());
-        return value.value();
+        return "NUM"; // TODO
     default:
         assert(false && "Unimplemented");
         return "";
@@ -238,8 +227,7 @@ std::optional<NodeDocType> Parser::parse_doc_type()
     consume();
     consume();
     consume();
-    NodeDocType doc_type;
-    doc_type.type = consume().value.value();
+    NodeDocType doc_type { .type = consume().view };
     consume();
     return doc_type;
 }
@@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
     return peek(ahead).value().get().type == type;
 }
 
-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
-    if (!peek_is(ahead, type)) {
-        return false;
-    }
-    return peek(ahead).value().get().value.has_value();
-}
-
 bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
 {
-    if (!peek_is_with_val(ahead, type)) {
+    if (!peek_is(ahead, type)) {
         return false;
     }
     switch (cmp) {
@@ -350,4 +330,24 @@ std::optional<NodeAttr> Parser::parse_attr()
         }
     }
 }
+
+Utf8View::Utf8View(Utf8View::Iterator begin)
+    : m_begin(begin)
+    , m_end(Iterator(begin.base(), begin.base(), begin.base() + 1))
+{
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end)
+    : m_begin(begin)
+    , m_end(end)
+{
+}
+
+Utf8View::Iterator Utf8View::begin() const
+{
+    return m_begin;
+}
+
+Utf8View::Iterator Utf8View::end() const
+{
+    return m_end;
+}
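Note: with the owned std::optional<std::string> value replaced by a non-owning Utf8View, the token text now lives in m_source and must be copied out on demand. A minimal sketch of that copy is shown below, for example to replace the "IDENT" / "STR" / "NUM" placeholders left as TODO in Token::to_string(). The to_owned helper is hypothetical and not part of this commit; it relies only on utf8::iterator::base() exposing the underlying std::string::iterator, as the single-iterator Utf8View constructor above already does.

// Hypothetical helper (not in this commit): copy the raw UTF-8 bytes spanned
// by a Utf8View into an owned std::string. base() returns the underlying
// std::string::iterator of the utfcpp iterator.
std::string to_owned(const Utf8View& view)
{
    return std::string(view.begin().base(), view.end().base());
}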

View File

@@ -8,6 +8,21 @@
 #include <utf8.h>
 
+class Utf8View {
+public:
+    using Iterator = utf8::iterator<std::string::iterator>;
+
+    explicit Utf8View(Iterator begin);
+    Utf8View(Iterator begin, Iterator end);
+
+    [[nodiscard]] Iterator begin() const;
+    [[nodiscard]] Iterator end() const;
+
+private:
+    Iterator m_begin;
+    Iterator m_end;
+};
+
 namespace html {
 
 enum class TokenType {
@@ -34,7 +49,7 @@ enum class TokenType {
 struct Token {
     TokenType type;
-    std::optional<std::string> value = {};
+    Utf8View view;
 
     std::string to_string();
 };
@@ -50,7 +65,7 @@ private:
 };
 
 struct NodeDocType {
-    std::string type;
+    Utf8View type;
 };
 
 struct NodeAttr {
@@ -96,7 +111,6 @@ private:
     enum class StrCmp { case_sensitive, case_insensitive };
 
     bool peek_is(size_t ahead, TokenType type);
-    bool peek_is_with_val(size_t ahead, TokenType type);
     bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
     Token& consume();
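For reference, a small standalone sketch of how the new Utf8View is meant to be used: wrap a code-point range of a UTF-8 string, walk it with the utfcpp iterator, then recover the raw bytes via base(). This is illustrative only; it assumes the project header declaring Utf8View is included (its filename is not shown in this diff) and that utfcpp is available.

// Standalone usage sketch (assumes the header above plus utfcpp are available).
#include <cstdint>
#include <iostream>
#include <string>
#include <utf8.h>

int main()
{
    std::string source = "héllo";

    // utf8::iterator takes (position, range_start, range_end).
    Utf8View::Iterator begin(source.begin(), source.begin(), source.end());
    Utf8View::Iterator end(source.end(), source.begin(), source.end());
    Utf8View view(begin, end);

    for (auto it = view.begin(); it != view.end(); ++it) {
        std::cout << static_cast<std::uint32_t>(*it) << ' '; // code point values
    }
    std::cout << '\n';

    // The viewed bytes, copied back into an owned string.
    std::cout << std::string(view.begin().base(), view.end().base()) << '\n';
}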