More parsing implementation

2023-09-28 14:35:25 -04:00 · 2023-09-28 14:35:25 -04:00 · a341997a17
commit a341997a17
parent 474d4305f7
3 changed files with 103 additions and 14 deletions
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@ -1,5 +1,6 @@
 #include "html_parse.hpp"
 #include <cassert>
 #include <iostream>
 namespace html {
@ -10,11 +11,44 @@ Tokenizer::Tokenizer(std::u32string source)
 {
 }
 // Reports whether `c` is an ASCII letter, i.e. in 'A'..'Z' or 'a'..'z'.
 // Code points outside the ASCII range are never treated as letters.
 static bool is_alpha(char32_t c)
 {
    const bool is_upper = c >= U'A' && c <= U'Z';
    const bool is_lower = c >= U'a' && c <= U'z';
    return is_upper || is_lower;
 }
 // Reports whether `c` is an ASCII decimal digit, i.e. in '0'..'9'.
 static bool is_num(char32_t c)
 {
    if (c < U'0') {
        return false;
    }
    return c <= U'9';
 }
 // Reports whether `c` is an ASCII letter or an ASCII decimal digit.
 static bool is_alpha_num(char32_t c)
 {
    // Letters are checked first; digits are only consulted when that fails.
    if (is_alpha(c)) {
        return true;
    }
    return is_num(c);
 }
 // Reports whether `c` is horizontal whitespace: a space (U+0020) or a
 // horizontal tab (U+0009).
 //
 // Tabs are accepted because tab-indented markup is common; without this the
 // tokenizer would report every tab as an unexpected token. Line feeds are
 // intentionally not matched here; is_newline handles those.
 static bool is_space(char32_t c)
 {
    return c == U' ' || c == U'\t';
 }
 // Reports whether `c` is the line feed character (U+000A).
 static bool is_newline(char32_t c)
 {
    constexpr char32_t line_feed = U'\n';
    return c == line_feed;
 }
 std::vector<Token> Tokenizer::tokenize()
 {
    std::u32string buffer;
    std::vector<Token> tokens;
    while (peek().has_value()) {
-        if (peek().value() == '<') {
+        if (is_alpha(peek().value())) {
            do {
                buffer.push_back(consume());
            } while (peek().has_value() && is_alpha_num(peek().value()));
            tokens.push_back({ .type = TokenType::ident, .value = buffer });
            buffer.clear();
        }
        else if (peek().value() == '<') {
            consume();
            tokens.push_back({ .type = TokenType::lt });
        }
@ -22,13 +56,35 @@ std::vector<Token> Tokenizer::tokenize()
            consume();
            tokens.push_back({ .type = TokenType::gt });
        }
        else if (peek().value() == '!') {
            consume();
            tokens.push_back({ .type = TokenType::exclaim });
        }
        else if (peek().value() == '/') {
            consume();
            tokens.push_back({ .type = TokenType::fslash });
        }
        else if (peek().value() == '=') {
            consume();
            tokens.push_back({ .type = TokenType::eq });
        }
        else if (peek().value() == '"') {
            do {
                buffer.push_back(consume());
            } while (peek().has_value() && peek().value() != '"');
            tokens.push_back({ .type = TokenType::str, .value = buffer });
            buffer.clear();
        }
        else if (is_space(peek().value()) || is_newline(peek().value())) {
            consume();
        }
        else {
            std::cout << "[WARN] Unexpected token: " << std::to_string(peek().value()) << std::endl;
            consume();
        }
    }
-    return {};
+    return tokens;
 }
 std::optional<char32_t> Tokenizer::peek(size_t ahead)
 {
@ -43,4 +99,35 @@ char32_t Tokenizer::consume()
    return m_source.at(m_index++);
 }
 // Renders this token as UTF-32 text.
 //
 // Punctuation tokens yield their single character. `ident` and `str` tokens
 // yield their stored `value`, which must be present (asserted). Any other
 // token type trips an assertion and yields an empty string.
 std::u32string Token::to_string()
 {
    switch (type) {
    case TokenType::lt:
        return U"<";
    case TokenType::gt:
        return U">";
    case TokenType::fslash:
        return U"/";
    case TokenType::exclaim:
        return U"!";
    case TokenType::eq:
        return U"=";
    case TokenType::ident:
    case TokenType::str:
        // Both token kinds carry their text in `value`.
        assert(value.has_value());
        return value.value();
    default:
        assert(false && "Unimplemented");
        return {};
    }
 }
 }
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@ -6,17 +6,13 @@
 namespace html {
-enum class TokenType {
+enum class TokenType { lt, gt, ident, fslash, str, exclaim, eq };
    lt,
    gt,
    ident,
    fslash,
    quote,
 };
 struct Token {
    TokenType type;
-    std::optional<std::string> value = {};
+    std::optional<std::u32string> value = {};
    std::u32string to_string();
 };
 enum class NodeType {
--- a/src/main.cpp
+++ b/src/main.cpp
@ -1,13 +1,15 @@
 #include "fetch.hpp"
 #include <codecvt>
 #include <iostream>
 #include <optional>
 #include "html_parse.hpp"
 #define RAYGUI_IMPLEMENTATION
 #include <raygui.h>
 #include <raylib-cpp.hpp>
 #include "html_parse.hpp"
 int main()
 {
    init_curl();
@ -32,7 +34,11 @@ int main()
    if (page_data.has_value()) {
        std::u32string page_data_unicode(page_data.value().begin(), page_data.value().end());
        html::Tokenizer tokenizer(page_data_unicode);
-        tokenizer.tokenize();
+        std::vector<html::Token> tokens = tokenizer.tokenize();
        std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
        for (html::Token token : tokens) {
            std::cout << converter.to_bytes(token.to_string()) << std::endl;
        }
    }
    while (!window.ShouldClose()) {
@ -54,7 +60,7 @@ int main()
        scroll_pos += GetMouseWheelMove();
        if (page_data.has_value()) {
-            DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 20 }, 24, 1.0f, BLACK);
+            DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 40 }, 24, 1.0f, BLACK);
        }
        EndDrawing();