diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index ad1b4e4..3e2d691 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
std::vector Tokenizer::tokenize()
{
- std::string buffer;
std::vector tokens;
utf8::iterator it(m_source.begin(), m_source.begin(), m_source.end());
@@ -48,84 +47,77 @@ std::vector Tokenizer::tokenize()
for (; it != end; ++it) {
if (is_alpha(*it)) {
- do {
- utf8::append(*it, std::back_inserter(buffer));
- } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+ auto begin = it;
+ while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+ tokens.push_back({ .type = TokenType::ident, .view { begin, it } });
--it;
- tokens.push_back({ .type = TokenType::ident, .value = buffer });
- buffer.clear();
}
else if (*it == '<') {
- tokens.push_back({ .type = TokenType::lt });
+ tokens.push_back({ .type = TokenType::lt, .view { it } });
}
else if (*it == '>') {
- tokens.push_back({ .type = TokenType::gt });
+ tokens.push_back({ .type = TokenType::gt, .view { it } });
}
else if (*it == '!') {
- tokens.push_back({ .type = TokenType::exclaim });
+ tokens.push_back({ .type = TokenType::exclaim, .view { it } });
}
else if (*it == '/') {
- tokens.push_back({ .type = TokenType::fslash });
+ tokens.push_back({ .type = TokenType::fslash, .view { it } });
}
else if (*it == '=') {
- tokens.push_back({ .type = TokenType::eq });
+ tokens.push_back({ .type = TokenType::eq, .view { it } });
}
else if (*it == '-') {
- tokens.push_back({ .type = TokenType::minus });
+ tokens.push_back({ .type = TokenType::minus, .view { it } });
}
else if (*it == ':') {
- tokens.push_back({ .type = TokenType::colon });
+ tokens.push_back({ .type = TokenType::colon, .view { it } });
}
else if (*it == '#') {
- tokens.push_back({ .type = TokenType::hash });
+ tokens.push_back({ .type = TokenType::hash, .view { it } });
}
else if (*it == ';') {
- tokens.push_back({ .type = TokenType::semi });
+ tokens.push_back({ .type = TokenType::semi, .view { it } });
}
else if (is_num(*it)) {
- do {
- utf8::append(*it, std::back_inserter(buffer));
- } while (++it != end && is_num(*it));
+ auto begin = it;
+ while (++it != end && is_num(*it)) { }
+ tokens.push_back({ .type = TokenType::num, .view { begin, it } });
--it;
- tokens.push_back({ .type = TokenType::num, .value = buffer });
- buffer.clear();
}
else if (*it == ',') {
- tokens.push_back({ .type = TokenType::comma });
+ tokens.push_back({ .type = TokenType::comma, .view { it } });
}
else if (*it == '{') {
- tokens.push_back({ .type = TokenType::left_curly });
+ tokens.push_back({ .type = TokenType::left_curly, .view { it } });
}
else if (*it == '}') {
- tokens.push_back({ .type = TokenType::right_curly });
+ tokens.push_back({ .type = TokenType::right_curly, .view { it } });
}
else if (*it == '(') {
- tokens.push_back({ .type = TokenType::left_paren });
+ tokens.push_back({ .type = TokenType::left_paren, .view { it } });
}
else if (*it == ')') {
- tokens.push_back({ .type = TokenType::right_paren });
+ tokens.push_back({ .type = TokenType::right_paren, .view { it } });
}
else if (*it == '.') {
- tokens.push_back({ .type = TokenType::dot });
+ tokens.push_back({ .type = TokenType::dot, .view { it } });
}
else if (*it == '@') {
- tokens.push_back({ .type = TokenType::at });
+ tokens.push_back({ .type = TokenType::at, .view { it } });
}
else if (*it == '"') {
++it;
+ auto begin = it;
while (it != end && *it != '"') {
- utf8::append(*it, std::back_inserter(buffer));
++it;
}
- tokens.push_back({ .type = TokenType::str, .value = buffer });
- buffer.clear();
+ tokens.push_back({ .type = TokenType::str, .view { begin, it } });
}
else if (is_space(*it) || is_newline(*it)) {
}
else {
- utf8::append(*it, std::back_inserter(buffer));
- std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
- buffer.clear();
+ std::cout << "[WARN] Unexpected token: U+" << std::hex << static_cast<unsigned long>(*it) << std::dec << std::endl;
}
}
@@ -140,13 +132,11 @@ std::string Token::to_string()
case TokenType::gt:
return ">";
case TokenType::ident:
- assert(value.has_value());
- return value.value();
+ return std::string(view.begin().base(), view.end().base());
case TokenType::fslash:
return "/";
case TokenType::str:
- assert(value.has_value());
- return value.value();
+ return std::string(view.begin().base(), view.end().base());
case TokenType::exclaim:
return "!";
case TokenType::eq:
@@ -174,8 +164,7 @@ std::string Token::to_string()
case TokenType::semi:
return ";";
case TokenType::num:
- assert(value.has_value());
- return value.value();
+ return std::string(view.begin().base(), view.end().base());
default:
assert(false && "Unimplemented");
return "";
@@ -238,8 +227,7 @@ std::optional Parser::parse_doc_type()
consume();
consume();
consume();
- NodeDocType doc_type;
- doc_type.type = consume().value.value();
+ NodeDocType doc_type { .type = consume().view };
consume();
return doc_type;
}
@@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
return peek(ahead).value().get().type == type;
}
-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
- if (!peek_is(ahead, type)) {
- return false;
- }
- return peek(ahead).value().get().value.has_value();
-}
-
bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
{
- if (!peek_is_with_val(ahead, type)) {
+ if (!peek_is(ahead, type)) {
return false;
}
switch (cmp) {
@@ -350,4 +330,24 @@ std::optional Parser::parse_attr()
}
}
-}
\ No newline at end of file
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin)
+ : m_begin(begin)
+ , m_end(++begin)
+{
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end)
+ : m_begin(begin)
+ , m_end(end)
+{
+}
+Utf8View::Iterator Utf8View::begin() const
+{
+ return m_begin;
+}
+Utf8View::Iterator Utf8View::end() const
+{
+ return m_end;
+}
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index eb271d1..649de6c 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -8,6 +8,21 @@
#include
+class Utf8View {
+public:
+ using Iterator = utf8::iterator;
+
+ explicit Utf8View(Iterator begin);
+ Utf8View(Iterator begin, Iterator end);
+
+ [[nodiscard]] Iterator begin() const;
+ [[nodiscard]] Iterator end() const;
+
+private:
+ Iterator m_begin;
+ Iterator m_end;
+};
+
namespace html {
enum class TokenType {
@@ -34,7 +49,7 @@ enum class TokenType {
struct Token {
TokenType type;
- std::optional value = {};
+ Utf8View view;
std::string to_string();
};
@@ -50,7 +65,7 @@ private:
};
struct NodeDocType {
- std::string type;
+ Utf8View type;
};
struct NodeAttr {
@@ -96,7 +111,6 @@ private:
enum class StrCmp { case_sensitive, case_insensitive };
bool peek_is(size_t ahead, TokenType type);
- bool peek_is_with_val(size_t ahead, TokenType type);
bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
Token& consume();