More parsing implementation

This commit is contained in:
orosmatthew 2023-09-28 14:35:25 -04:00
parent 474d4305f7
commit a341997a17
3 changed files with 103 additions and 14 deletions

View File

@ -1,5 +1,6 @@
#include "html_parse.hpp" #include "html_parse.hpp"
#include <cassert>
#include <iostream> #include <iostream>
namespace html { namespace html {
@ -10,11 +11,44 @@ Tokenizer::Tokenizer(std::u32string source)
{ {
} }
/// Returns true when `c` is an ASCII letter (A-Z or a-z).
static bool is_alpha(char32_t c)
{
    const bool is_upper = U'A' <= c && c <= U'Z';
    const bool is_lower = U'a' <= c && c <= U'z';
    return is_upper || is_lower;
}
/// Returns true when `c` is an ASCII decimal digit (0-9).
static bool is_num(char32_t c)
{
    if (c < U'0') {
        return false;
    }
    return c <= U'9';
}
/// Returns true when `c` is accepted by either is_alpha() or is_num().
static bool is_alpha_num(char32_t c)
{
    if (is_alpha(c)) {
        return true;
    }
    return is_num(c);
}
/// Returns true when `c` is horizontal ASCII whitespace.
///
/// Accepts SPACE (U+0020), TAB (U+0009) and FORM FEED (U+000C). Line
/// terminators are deliberately excluded; they are classified by
/// is_newline(). Previously only U+0020 was accepted, so tab-indented
/// markup was reported as unexpected input by the tokenizer.
static bool is_space(char32_t c)
{
    switch (c) {
    case U' ':
    case U'\t':
    case U'\f':
        return true;
    default:
        return false;
    }
}
/// Returns true when `c` is an ASCII line terminator.
///
/// Accepts LINE FEED (U+000A) and CARRIAGE RETURN (U+000D). Previously only
/// LF was accepted, so each CR of a CRLF-terminated document was reported
/// as unexpected input by the tokenizer.
static bool is_newline(char32_t c)
{
    return c == U'\n' || c == U'\r';
}
std::vector<Token> Tokenizer::tokenize() std::vector<Token> Tokenizer::tokenize()
{ {
std::u32string buffer;
std::vector<Token> tokens; std::vector<Token> tokens;
while (peek().has_value()) { while (peek().has_value()) {
if (peek().value() == '<') { if (is_alpha(peek().value())) {
do {
buffer.push_back(consume());
} while (peek().has_value() && is_alpha_num(peek().value()));
tokens.push_back({ .type = TokenType::ident, .value = buffer });
buffer.clear();
}
else if (peek().value() == '<') {
consume(); consume();
tokens.push_back({ .type = TokenType::lt }); tokens.push_back({ .type = TokenType::lt });
} }
@ -22,13 +56,35 @@ std::vector<Token> Tokenizer::tokenize()
consume(); consume();
tokens.push_back({ .type = TokenType::gt }); tokens.push_back({ .type = TokenType::gt });
} }
else if (peek().value() == '!') {
consume();
tokens.push_back({ .type = TokenType::exclaim });
}
else if (peek().value() == '/') {
consume();
tokens.push_back({ .type = TokenType::fslash });
}
else if (peek().value() == '=') {
consume();
tokens.push_back({ .type = TokenType::eq });
}
else if (peek().value() == '"') {
do {
buffer.push_back(consume());
} while (peek().has_value() && peek().value() != '"');
tokens.push_back({ .type = TokenType::str, .value = buffer });
buffer.clear();
}
else if (is_space(peek().value()) || is_newline(peek().value())) {
consume();
}
else { else {
std::cout << "[WARN] Unexpected token: " << std::to_string(peek().value()) << std::endl; std::cout << "[WARN] Unexpected token: " << std::to_string(peek().value()) << std::endl;
consume(); consume();
} }
} }
return {}; return tokens;
} }
std::optional<char32_t> Tokenizer::peek(size_t ahead) std::optional<char32_t> Tokenizer::peek(size_t ahead)
{ {
@ -43,4 +99,35 @@ char32_t Tokenizer::consume()
return m_source.at(m_index++); return m_source.at(m_index++);
} }
/// Renders this token as UTF-32 text.
///
/// Punctuation tokens (lt, gt, fslash, exclaim, eq) yield their single
/// character. ident and str tokens yield the stored `value`, which is
/// asserted to be present. Any other token type trips an assertion and,
/// when assertions are compiled out, yields an empty string.
std::u32string Token::to_string()
{
    // Builds a one-character string for the punctuation token types.
    const auto single_char = [](char32_t ch) {
        std::u32string text;
        text.push_back(ch);
        return text;
    };

    switch (type) {
    case TokenType::lt:
        return single_char(U'<');
    case TokenType::gt:
        return single_char(U'>');
    case TokenType::fslash:
        return single_char(U'/');
    case TokenType::exclaim:
        return single_char(U'!');
    case TokenType::eq:
        return single_char(U'=');
    case TokenType::ident:
    case TokenType::str:
        // Both token kinds carry their text in `value`.
        assert(value.has_value());
        return value.value();
    default:
        assert(false && "Unimplemented");
        return std::u32string();
    }
}
}

View File

@ -6,17 +6,13 @@
namespace html { namespace html {
enum class TokenType { enum class TokenType { lt, gt, ident, fslash, str, exclaim, eq };
lt,
gt,
ident,
fslash,
quote,
};
struct Token { struct Token {
TokenType type; TokenType type;
std::optional<std::string> value = {}; std::optional<std::u32string> value = {};
std::u32string to_string();
}; };
enum class NodeType { enum class NodeType {

View File

@ -1,13 +1,15 @@
#include "fetch.hpp" #include "fetch.hpp"
#include <codecvt>
#include <iostream>
#include <optional> #include <optional>
#include "html_parse.hpp"
#define RAYGUI_IMPLEMENTATION #define RAYGUI_IMPLEMENTATION
#include <raygui.h> #include <raygui.h>
#include <raylib-cpp.hpp> #include <raylib-cpp.hpp>
#include "html_parse.hpp"
int main() int main()
{ {
init_curl(); init_curl();
@ -32,7 +34,11 @@ int main()
if (page_data.has_value()) { if (page_data.has_value()) {
std::u32string page_data_unicode(page_data.value().begin(), page_data.value().end()); std::u32string page_data_unicode(page_data.value().begin(), page_data.value().end());
html::Tokenizer tokenizer(page_data_unicode); html::Tokenizer tokenizer(page_data_unicode);
tokenizer.tokenize(); std::vector<html::Token> tokens = tokenizer.tokenize();
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter;
for (html::Token token : tokens) {
std::cout << converter.to_bytes(token.to_string()) << std::endl;
}
} }
while (!window.ShouldClose()) { while (!window.ShouldClose()) {
@ -54,7 +60,7 @@ int main()
scroll_pos += GetMouseWheelMove(); scroll_pos += GetMouseWheelMove();
if (page_data.has_value()) { if (page_data.has_value()) {
DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 20 }, 24, 1.0f, BLACK); DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 40 }, 24, 1.0f, BLACK);
} }
EndDrawing(); EndDrawing();