Initial utf8view

orosmatthew 2023-10-04 14:59:32 -04:00
parent b537916c6b
commit 3407b86171
2 changed files with 68 additions and 54 deletions

View File

@@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
 std::vector<Token> Tokenizer::tokenize()
 {
-    std::string buffer;
     std::vector<Token> tokens;
     utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end());
@@ -48,84 +47,77 @@ std::vector<Token> Tokenizer::tokenize()
     for (; it != end; ++it) {
         if (is_alpha(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && (is_alpha_num(*it) || *it == '-'));
+            auto begin = it;
+            while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
+            Token token { .type = TokenType::ident, .view { begin, it } };
             --it;
-            tokens.push_back({ .type = TokenType::ident, .value = buffer });
-            buffer.clear();
         }
         else if (*it == '<') {
-            tokens.push_back({ .type = TokenType::lt });
+            tokens.push_back({ .type = TokenType::lt, .view { it } });
         }
         else if (*it == '>') {
-            tokens.push_back({ .type = TokenType::gt });
+            tokens.push_back({ .type = TokenType::gt, .view { it } });
         }
         else if (*it == '!') {
-            tokens.push_back({ .type = TokenType::exclaim });
+            tokens.push_back({ .type = TokenType::exclaim, .view { it } });
        }
         else if (*it == '/') {
-            tokens.push_back({ .type = TokenType::fslash });
+            tokens.push_back({ .type = TokenType::fslash, .view { it } });
         }
         else if (*it == '=') {
-            tokens.push_back({ .type = TokenType::eq });
+            tokens.push_back({ .type = TokenType::eq, .view { it } });
         }
         else if (*it == '-') {
-            tokens.push_back({ .type = TokenType::minus });
+            tokens.push_back({ .type = TokenType::minus, .view { it } });
         }
         else if (*it == ':') {
-            tokens.push_back({ .type = TokenType::colon });
+            tokens.push_back({ .type = TokenType::colon, .view { it } });
         }
         else if (*it == '#') {
-            tokens.push_back({ .type = TokenType::hash });
+            tokens.push_back({ .type = TokenType::hash, .view { it } });
        }
         else if (*it == ';') {
-            tokens.push_back({ .type = TokenType::semi });
+            tokens.push_back({ .type = TokenType::semi, .view { it } });
         }
         else if (is_num(*it)) {
-            do {
-                utf8::append(*it, std::back_inserter(buffer));
-            } while (++it != end && is_num(*it));
+            auto begin = it;
+            while (++it != end && is_num(*it)) { };
+            tokens.push_back({ .type = TokenType::num, .view { begin, it } });
             --it;
-            tokens.push_back({ .type = TokenType::num, .value = buffer });
-            buffer.clear();
         }
         else if (*it == ',') {
-            tokens.push_back({ .type = TokenType::comma });
+            tokens.push_back({ .type = TokenType::comma, .view { it } });
         }
         else if (*it == '{') {
-            tokens.push_back({ .type = TokenType::left_curly });
+            tokens.push_back({ .type = TokenType::left_curly, .view { it } });
        }
         else if (*it == '}') {
-            tokens.push_back({ .type = TokenType::right_curly });
+            tokens.push_back({ .type = TokenType::right_curly, .view { it } });
         }
         else if (*it == '(') {
-            tokens.push_back({ .type = TokenType::left_paren });
+            tokens.push_back({ .type = TokenType::left_paren, .view { it } });
         }
         else if (*it == ')') {
-            tokens.push_back({ .type = TokenType::right_paren });
+            tokens.push_back({ .type = TokenType::right_paren, .view { it } });
         }
         else if (*it == '.') {
-            tokens.push_back({ .type = TokenType::dot });
+            tokens.push_back({ .type = TokenType::dot, .view { it } });
         }
         else if (*it == '@') {
-            tokens.push_back({ .type = TokenType::at });
+            tokens.push_back({ .type = TokenType::at, .view { it } });
         }
         else if (*it == '"') {
+            auto begin = it;
             ++it;
             while (it != end && *it != '"') {
-                utf8::append(*it, std::back_inserter(buffer));
                 ++it;
             }
-            tokens.push_back({ .type = TokenType::str, .value = buffer });
-            buffer.clear();
+            tokens.push_back({ .type = TokenType::str, .view { begin, it } });
         }
         else if (is_space(*it) || is_newline(*it)) {
         }
         else {
-            utf8::append(*it, std::back_inserter(buffer));
-            std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
-            buffer.clear();
+            std::cout << "[WARN] Unexpected token" << std::endl;
         }
     }
@@ -140,13 +132,11 @@ std::string Token::to_string()
     case TokenType::gt:
         return ">";
     case TokenType::ident:
-        assert(value.has_value());
-        return value.value();
+        return "IDENT"; // TODO
     case TokenType::fslash:
         return "/";
     case TokenType::str:
-        assert(value.has_value());
-        return value.value();
+        return "STR"; // TODO
     case TokenType::exclaim:
         return "!";
     case TokenType::eq:
@@ -174,8 +164,7 @@ std::string Token::to_string()
     case TokenType::semi:
         return ";";
     case TokenType::num:
-        assert(value.has_value());
-        return value.value();
+        return "NUM"; // TODO
     default:
         assert(false && "Unimplemented");
         return "";
@@ -238,8 +227,7 @@ std::optional<NodeDocType> Parser::parse_doc_type()
     consume();
     consume();
     consume();
-    NodeDocType doc_type;
-    doc_type.type = consume().value.value();
+    NodeDocType doc_type { .type = consume().view };
     consume();
     return doc_type;
 }
@@ -254,17 +242,9 @@ bool Parser::peek_is(size_t ahead, TokenType type)
     return peek(ahead).value().get().type == type;
 }
 
-bool Parser::peek_is_with_val(size_t ahead, TokenType type)
-{
-    if (!peek_is(ahead, type)) {
-        return false;
-    }
-    return peek(ahead).value().get().value.has_value();
-}
-
 bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
 {
-    if (!peek_is_with_val(ahead, type)) {
+    if (!peek_is(ahead, type)) {
         return false;
     }
     switch (cmp) {
@@ -350,4 +330,24 @@ std::optional<NodeAttr> Parser::parse_attr()
         }
     }
 }
+
+Utf8View::Utf8View(Utf8View::Iterator begin)
+    : m_begin(begin)
+    , m_end(Iterator(begin.base(), begin.base(), begin.base() + 1))
+{
+}
+
+Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end)
+    : m_begin(begin)
+    , m_end(end)
+{
+}
+
+Utf8View::Iterator Utf8View::begin() const
+{
+    return m_begin;
+}
+
+Utf8View::Iterator Utf8View::end() const
+{
+    return m_end;
+}
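Note: with the owned std::optional<std::string> value replaced by a non-owning Utf8View, the token text now lives in m_source and must be copied out on demand. A minimal sketch of that copy is shown below, for example to replace the "IDENT" / "STR" / "NUM" placeholders left as TODO in Token::to_string(). The to_owned helper is hypothetical and not part of this commit; it relies only on utf8::iterator::base() exposing the underlying std::string::iterator, as the single-iterator Utf8View constructor above already does.

// Hypothetical helper (not in this commit): copy the raw UTF-8 bytes spanned
// by a Utf8View into an owned std::string. base() returns the underlying
// std::string::iterator of the utfcpp iterator.
std::string to_owned(const Utf8View& view)
{
    return std::string(view.begin().base(), view.end().base());
}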

View File

@@ -8,6 +8,21 @@
 #include <utf8.h>
 
+class Utf8View {
+public:
+    using Iterator = utf8::iterator<std::string::iterator>;
+
+    explicit Utf8View(Iterator begin);
+    Utf8View(Iterator begin, Iterator end);
+
+    [[nodiscard]] Iterator begin() const;
+    [[nodiscard]] Iterator end() const;
+
+private:
+    Iterator m_begin;
+    Iterator m_end;
+};
+
 namespace html {
 
 enum class TokenType {
@@ -34,7 +49,7 @@ enum class TokenType {
 struct Token {
     TokenType type;
-    std::optional<std::string> value = {};
+    Utf8View view;
 
     std::string to_string();
 };
@@ -50,7 +65,7 @@ private:
 };
 
 struct NodeDocType {
-    std::string type;
+    Utf8View type;
 };
 
 struct NodeAttr {
@@ -96,7 +111,6 @@ private:
     enum class StrCmp { case_sensitive, case_insensitive };
 
     bool peek_is(size_t ahead, TokenType type);
-    bool peek_is_with_val(size_t ahead, TokenType type);
     bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
     Token& consume();
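For reference, a small standalone sketch of how the new Utf8View is meant to be used: wrap a code-point range of a UTF-8 string, walk it with the utfcpp iterator, then recover the raw bytes via base(). This is illustrative only; it assumes the project header declaring Utf8View is included (its filename is not shown in this diff) and that utfcpp is available.

// Standalone usage sketch (assumes the header above plus utfcpp are available).
#include <cstdint>
#include <iostream>
#include <string>
#include <utf8.h>

int main()
{
    std::string source = "héllo";

    // utf8::iterator takes (position, range_start, range_end).
    Utf8View::Iterator begin(source.begin(), source.begin(), source.end());
    Utf8View::Iterator end(source.end(), source.begin(), source.end());
    Utf8View view(begin, end);

    for (auto it = view.begin(); it != view.end(); ++it) {
        std::cout << static_cast<std::uint32_t>(*it) << ' '; // code point values
    }
    std::cout << '\n';

    // The viewed bytes, copied back into an owned string.
    std::cout << std::string(view.begin().base(), view.end().base()) << '\n';
}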