From a341997a17fa97a233cc8f5397fc17359dc25c0e Mon Sep 17 00:00:00 2001 From: orosmatthew Date: Thu, 28 Sep 2023 14:35:25 -0400 Subject: [PATCH] More parsing implementation --- src/html_parse.cpp | 91 +++++++++++++++++++++++++++++++++++++++++++++- src/html_parse.hpp | 12 ++---- src/main.cpp | 14 +++++-- 3 files changed, 103 insertions(+), 14 deletions(-) diff --git a/src/html_parse.cpp b/src/html_parse.cpp index 3da909a..45b1188 100644 --- a/src/html_parse.cpp +++ b/src/html_parse.cpp @@ -1,5 +1,6 @@ #include "html_parse.hpp" +#include #include namespace html { @@ -10,11 +11,44 @@ Tokenizer::Tokenizer(std::u32string source) { } +static bool is_alpha(char32_t c) +{ + return (c >= 65 && c <= 90) || (c >= 97 && c <= 122); +} + +static bool is_num(char32_t c) +{ + return c >= 48 && c <= 57; +} + +static bool is_alpha_num(char32_t c) +{ + return is_alpha(c) || is_num(c); +} + +static bool is_space(char32_t c) +{ + return c == 32; +} + +static bool is_newline(char32_t c) +{ + return c == 10; +} + std::vector Tokenizer::tokenize() { + std::u32string buffer; std::vector tokens; while (peek().has_value()) { - if (peek().value() == '<') { + if (is_alpha(peek().value())) { + do { + buffer.push_back(consume()); + } while (peek().has_value() && is_alpha_num(peek().value())); + tokens.push_back({ .type = TokenType::ident, .value = buffer }); + buffer.clear(); + } + else if (peek().value() == '<') { consume(); tokens.push_back({ .type = TokenType::lt }); } @@ -22,13 +56,35 @@ std::vector Tokenizer::tokenize() consume(); tokens.push_back({ .type = TokenType::gt }); } + else if (peek().value() == '!') { + consume(); + tokens.push_back({ .type = TokenType::exclaim }); + } + else if (peek().value() == '/') { + consume(); + tokens.push_back({ .type = TokenType::fslash }); + } + else if (peek().value() == '=') { + consume(); + tokens.push_back({ .type = TokenType::eq }); + } + else if (peek().value() == '"') { + do { + buffer.push_back(consume()); + } while (peek().has_value() && peek().value() != '"'); + tokens.push_back({ .type = TokenType::str, .value = buffer }); + buffer.clear(); + } + else if (is_space(peek().value()) || is_newline(peek().value())) { + consume(); + } else { std::cout << "[WARN] Unexpected token: " << std::to_string(peek().value()) << std::endl; consume(); } } - return {}; + return tokens; } std::optional Tokenizer::peek(size_t ahead) { @@ -43,4 +99,35 @@ char32_t Tokenizer::consume() return m_source.at(m_index++); } +std::u32string Token::to_string() +{ + std::u32string buffer; + switch (type) { + case TokenType::lt: + buffer.push_back('<'); + return buffer; + case TokenType::gt: + buffer.push_back('>'); + return buffer; + case TokenType::ident: + assert(value.has_value()); + return value.value(); + case TokenType::fslash: + buffer.push_back('/'); + return buffer; + case TokenType::str: + assert(value.has_value()); + return value.value(); + case TokenType::exclaim: + buffer.push_back('!'); + return buffer; + case TokenType::eq: + buffer.push_back('='); + return buffer; + default: + assert(false && "Unimplemented"); + return buffer; + } } + +} \ No newline at end of file diff --git a/src/html_parse.hpp b/src/html_parse.hpp index c011c84..b72620a 100644 --- a/src/html_parse.hpp +++ b/src/html_parse.hpp @@ -6,17 +6,13 @@ namespace html { -enum class TokenType { - lt, - gt, - ident, - fslash, - quote, -}; +enum class TokenType { lt, gt, ident, fslash, str, exclaim, eq }; struct Token { TokenType type; - std::optional value = {}; + std::optional value = {}; + + std::u32string to_string(); }; enum class NodeType { diff --git a/src/main.cpp b/src/main.cpp index c91984a..520ec39 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,13 +1,15 @@ #include "fetch.hpp" +#include +#include #include -#include "html_parse.hpp" - #define RAYGUI_IMPLEMENTATION #include #include +#include "html_parse.hpp" + int main() { init_curl(); @@ -32,7 +34,11 @@ int main() if (page_data.has_value()) { std::u32string page_data_unicode(page_data.value().begin(), page_data.value().end()); html::Tokenizer tokenizer(page_data_unicode); - tokenizer.tokenize(); + std::vector tokens = tokenizer.tokenize(); + std::wstring_convert, char32_t> converter; + for (html::Token token : tokens) { + std::cout << converter.to_bytes(token.to_string()) << std::endl; + } } while (!window.ShouldClose()) { @@ -54,7 +60,7 @@ int main() scroll_pos += GetMouseWheelMove(); if (page_data.has_value()) { - DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 20 }, 24, 1.0f, BLACK); + DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 40 }, 24, 1.0f, BLACK); } EndDrawing();