Initial parsing success

This commit is contained in:
orosmatthew 2023-09-29 19:52:17 -04:00
parent 8d13ea0fb8
commit 4e0cc367e6
4 changed files with 109 additions and 35 deletions

View File

@ -10,6 +10,6 @@ Defined Tokens
<elem> ::= <elem> ::=
| "<" <ident> <attr>* ">" (<elem> | <inner>)* "<" "/" <ident> ">" | "<" <ident> <attr>* ">" (<elem> | <inner>)* "<" "/" <ident> ">"
| "<" <ident> <attr>* "/" ">" | "<" <ident> <attr>* "/" ">"
<attr> ::= <ident> "=" <str> <attr> ::= <ident> "=" (<str> | <ident>)
<inner> ::= (<ident> | <dot> | <at> | <hash> | <num> | <comma> | <semi>)+ <inner> ::= (<ident> | <dot> | <at> | <hash> | <num> | <comma> | <semi>)+

View File

@ -50,7 +50,7 @@ std::vector<Token> Tokenizer::tokenize()
if (is_alpha(*it)) { if (is_alpha(*it)) {
do { do {
utf8::append(*it, std::back_inserter(buffer)); utf8::append(*it, std::back_inserter(buffer));
} while (++it != end && is_alpha_num(*it)); } while (++it != end && (is_alpha_num(*it) || *it == '-'));
--it; --it;
tokens.push_back({ .type = TokenType::ident, .value = buffer }); tokens.push_back({ .type = TokenType::ident, .value = buffer });
buffer.clear(); buffer.clear();
@ -190,7 +190,14 @@ Parser::Parser(std::vector<Token> tokens)
NodeDoc Parser::parse() NodeDoc Parser::parse()
{ {
return {}; NodeDoc doc;
if (auto doc_type = parse_doc_type()) {
doc.doc_type = doc_type.value();
}
while (auto elem = parse_elem()) {
doc.children.push_back(elem.value());
}
return doc;
} }
std::optional<std::reference_wrapper<Token>> Parser::peek(size_t ahead) std::optional<std::reference_wrapper<Token>> Parser::peek(size_t ahead)
@ -220,30 +227,20 @@ bool is_ci_equal(const std::string& s1, const std::string& s2)
std::optional<NodeDocType> Parser::parse_doc_type() std::optional<NodeDocType> Parser::parse_doc_type()
{ {
if (!peek().has_value()) { if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim)
return {}; && peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident)
} && peek_is(4, TokenType::gt)) {
Token& ahead1 = peek().value().get(); consume();
if (ahead1.type != TokenType::lt) {
return {};
}
if (!peek(2).has_value()) {
return {};
}
Token& ahead2 = peek(2).value().get();
if (ahead2.type != TokenType::ident || !ahead2.value.has_value() || !is_ci_equal(ahead2.value.value(), "doctype")) {
return {};
}
consume(); consume();
consume(); consume();
Token& ahead3 = peek(3).value().get();
if (ahead3.type != TokenType::ident) {
std::cerr << "[ERROR] Expected identifier" << std::endl;
exit(EXIT_FAILURE);
}
NodeDocType doc_type; NodeDocType doc_type;
doc_type.type = consume().value.value(); doc_type.type = consume().value.value();
consume();
return doc_type;
} }
return {};
}
bool Parser::peek_is(size_t ahead, TokenType type) bool Parser::peek_is(size_t ahead, TokenType type)
{ {
if (!peek(ahead).has_value()) { if (!peek(ahead).has_value()) {
@ -259,6 +256,7 @@ bool Parser::peek_is_with_val(size_t ahead, TokenType type)
} }
return peek(ahead).value().get().value.has_value(); return peek(ahead).value().get().value.has_value();
} }
bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp) bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
{ {
if (!peek_is_with_val(ahead, type)) { if (!peek_is_with_val(ahead, type)) {
@ -271,5 +269,80 @@ bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parse
return is_ci_equal(peek(ahead).value().get().value.value(), val); return is_ci_equal(peek(ahead).value().get().value.value(), val);
} }
} }
std::optional<NodeElem> Parser::parse_elem()
{
if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) {
consume();
std::string tag = consume().value.value();
std::vector<NodeAttr> attributes;
while (auto attr = parse_attr()) {
attributes.push_back(attr.value());
}
if (peek_is(0, TokenType::gt)) {
consume();
NodeElemReg elem_reg;
elem_reg.tag = tag;
elem_reg.attributes = std::move(attributes);
while (peek().has_value()
&& !(
peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
&& peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) {
if (auto child = parse_elem()) {
elem_reg.inner.push_back(child.value());
}
else if (peek().has_value()) {
elem_reg.inner.push_back(consume().to_string());
}
else {
assert(false && "Unexpected");
}
}
if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
&& peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt)) {
consume();
consume();
consume();
consume();
NodeElem elem;
elem.var = std::move(elem_reg);
return elem;
}
else {
assert(false && "Unexpected");
}
}
else if (peek_is(0, TokenType::fslash)) {
NodeElemSelfClose elem_close;
elem_close.tag = tag;
elem_close.attributes = std::move(attributes);
consume();
if (!peek_is(0, TokenType::gt)) {
assert(false && "Unexpected");
}
consume();
NodeElem elem;
elem.var = std::move(elem_close);
return elem;
}
else {
assert(false && "Unexpected");
}
}
return {};
}
std::optional<NodeAttr> Parser::parse_attr()
{
if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq)
&& (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) {
NodeAttr attr;
attr.key = consume().value.value();
consume();
attr.val = consume().value.value();
return attr;
}
else {
return {};
}
}
} }

View File

@ -58,16 +58,12 @@ struct NodeAttr {
std::string val; std::string val;
}; };
struct NodeInner {
std::string value;
};
struct NodeElem; struct NodeElem;
struct NodeElemReg { struct NodeElemReg {
std::string tag; std::string tag;
std::vector<NodeAttr> attributes; std::vector<NodeAttr> attributes;
std::vector<std::variant<NodeElem, NodeInner>> inner; std::vector<std::variant<NodeElem, std::string>> inner;
}; };
struct NodeElemSelfClose { struct NodeElemSelfClose {
@ -80,8 +76,8 @@ struct NodeElem {
}; };
struct NodeDoc { struct NodeDoc {
std::optional<NodeDocType> doc_type; std::optional<NodeDocType> doc_type {};
std::vector<NodeElem> children; std::vector<NodeElem> children {};
}; };
class Parser { class Parser {
@ -91,6 +87,8 @@ public:
NodeDoc parse(); NodeDoc parse();
std::optional<NodeDocType> parse_doc_type(); std::optional<NodeDocType> parse_doc_type();
std::optional<NodeElem> parse_elem();
std::optional<NodeAttr> parse_attr();
private: private:
std::optional<std::reference_wrapper<Token>> peek(size_t ahead = 0); std::optional<std::reference_wrapper<Token>> peek(size_t ahead = 0);

View File

@ -34,9 +34,12 @@ int main()
if (page_data.has_value()) { if (page_data.has_value()) {
html::Tokenizer tokenizer(page_data.value()); html::Tokenizer tokenizer(page_data.value());
std::vector<html::Token> tokens = tokenizer.tokenize(); std::vector<html::Token> tokens = tokenizer.tokenize();
for (html::Token token : tokens) { // for (html::Token token : tokens) {
std::cout << token.to_string() << std::endl; // std::cout << token.to_string() << std::endl;
} // }
html::Parser parser(std::move(tokens));
html::NodeDoc doc = parser.parse();
std::cout << "HERE" << std::endl;
} }
while (!window.ShouldClose()) { while (!window.ShouldClose()) {