diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index 3da909a..45b1188 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -1,5 +1,6 @@
#include "html_parse.hpp"
+#include
#include
namespace html {
@@ -10,11 +11,44 @@ Tokenizer::Tokenizer(std::u32string source)
{
}
+// Returns true when `c` is an ASCII letter, i.e. in 'A'..'Z' or 'a'..'z'.
+static bool is_alpha(char32_t c)
+{
+    const bool upper = c >= U'A' && c <= U'Z';
+    const bool lower = c >= U'a' && c <= U'z';
+    return upper || lower;
+}
+
+// Returns true when `c` is an ASCII decimal digit ('0'..'9').
+static bool is_num(char32_t c)
+{
+    return U'0' <= c && c <= U'9';
+}
+
+// Returns true when `c` is an ASCII letter or an ASCII decimal digit.
+static bool is_alpha_num(char32_t c)
+{
+    if (is_alpha(c)) {
+        return true;
+    }
+    return is_num(c);
+}
+
+// Returns true when `c` is inline ASCII whitespace that the tokenizer should skip:
+// space (U+0020), tab (U+0009), form feed (U+000C) or carriage return (U+000D).
+// Line feed is deliberately left to is_newline().
+static bool is_space(char32_t c)
+{
+    return c == U' ' || c == U'\t' || c == U'\f' || c == U'\r';
+}
+
+// Returns true when `c` is the line feed character (U+000A).
+static bool is_newline(char32_t c)
+{
+    return c == U'\n';
+}
+
std::vector Tokenizer::tokenize()
{
+ std::u32string buffer;
std::vector tokens;
while (peek().has_value()) {
- if (peek().value() == '<') {
+ if (is_alpha(peek().value())) {
+ do {
+ buffer.push_back(consume());
+ } while (peek().has_value() && is_alpha_num(peek().value()));
+ tokens.push_back({ .type = TokenType::ident, .value = buffer });
+ buffer.clear();
+ }
+ else if (peek().value() == '<') {
consume();
tokens.push_back({ .type = TokenType::lt });
}
@@ -22,13 +56,35 @@ std::vector Tokenizer::tokenize()
consume();
tokens.push_back({ .type = TokenType::gt });
}
+ else if (peek().value() == '!') {
+ consume();
+ tokens.push_back({ .type = TokenType::exclaim });
+ }
+ else if (peek().value() == '/') {
+ consume();
+ tokens.push_back({ .type = TokenType::fslash });
+ }
+ else if (peek().value() == '=') {
+ consume();
+ tokens.push_back({ .type = TokenType::eq });
+ }
+        else if (peek().value() == '"') {
+            // Quoted string: skip the opening quote, collect everything up to the
+            // matching quote, then skip that closing quote too. Leaving the closing
+            // quote unconsumed would make the next loop iteration treat it as the
+            // start of a new string and swallow the markup that follows it.
+            consume();
+            while (peek().has_value() && peek().value() != '"') {
+                buffer.push_back(consume());
+            }
+            // An unterminated string simply runs to the end of the input.
+            if (peek().has_value()) {
+                consume();
+            }
+            // The token value holds the string contents without the delimiters.
+            tokens.push_back({ .type = TokenType::str, .value = buffer });
+            buffer.clear();
+        }
+ else if (is_space(peek().value()) || is_newline(peek().value())) {
+ consume();
+ }
else {
std::cout << "[WARN] Unexpected token: " << std::to_string(peek().value()) << std::endl;
consume();
}
}
- return {};
+ return tokens;
}
std::optional Tokenizer::peek(size_t ahead)
{
@@ -43,4 +99,35 @@ char32_t Tokenizer::consume()
return m_source.at(m_index++);
}
+// Renders this token back to text. Punctuation tokens yield their single
+// character; ident and str tokens yield their stored value, which must be set.
+std::u32string Token::to_string()
+{
+    // Tokens that carry text return it unchanged.
+    if (type == TokenType::ident || type == TokenType::str) {
+        assert(value.has_value());
+        return value.value();
+    }
+
+    // Every remaining token kind maps to exactly one character.
+    std::u32string text;
+    switch (type) {
+    case TokenType::lt:
+        text.push_back('<');
+        break;
+    case TokenType::gt:
+        text.push_back('>');
+        break;
+    case TokenType::fslash:
+        text.push_back('/');
+        break;
+    case TokenType::exclaim:
+        text.push_back('!');
+        break;
+    case TokenType::eq:
+        text.push_back('=');
+        break;
+    default:
+        // Unknown kind: trips in debug builds, yields an empty string otherwise.
+        assert(false && "Unimplemented");
+        break;
+    }
+    return text;
}
+
+}
\ No newline at end of file
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index c011c84..b72620a 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -6,17 +6,13 @@
namespace html {
-enum class TokenType {
- lt,
- gt,
- ident,
- fslash,
- quote,
-};
+// Kinds of token produced by the tokenizer.
+enum class TokenType {
+    lt, // '<'
+    gt, // '>'
+    ident, // ASCII letter followed by ASCII letters/digits
+    fslash, // '/'
+    str, // text introduced by a double quote
+    exclaim, // '!'
+    eq, // '='
+};
struct Token {
TokenType type;
- std::optional value = {};
+ std::optional value = {}; // text payload; to_string() asserts it is set for ident and str tokens
+
+ std::u32string to_string(); // textual form: the punctuation character, or the stored value
};
enum class NodeType {
diff --git a/src/main.cpp b/src/main.cpp
index c91984a..520ec39 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,13 +1,15 @@
#include "fetch.hpp"
+#include
+#include
#include
-#include "html_parse.hpp"
-
#define RAYGUI_IMPLEMENTATION
#include
#include
+#include "html_parse.hpp"
+
int main()
{
init_curl();
@@ -32,7 +34,11 @@ int main()
if (page_data.has_value()) {
std::u32string page_data_unicode(page_data.value().begin(), page_data.value().end());
html::Tokenizer tokenizer(page_data_unicode);
- tokenizer.tokenize();
+ std::vector tokens = tokenizer.tokenize();
+ std::wstring_convert, char32_t> converter;
+ for (html::Token token : tokens) {
+ std::cout << converter.to_bytes(token.to_string()) << std::endl;
+ }
}
while (!window.ShouldClose()) {
@@ -54,7 +60,7 @@ int main()
scroll_pos += GetMouseWheelMove();
if (page_data.has_value()) {
- DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 20 }, 24, 1.0f, BLACK);
+ DrawTextEx(sans_font, page_data.value(), { 0, 20 + scroll_pos * 40 }, 24, 1.0f, BLACK);
}
EndDrawing();