Initial parsing success
This commit is contained in:
parent
8d13ea0fb8
commit
4e0cc367e6
@ -10,6 +10,6 @@ Defined Tokens
|
||||
<elem> ::=
|
||||
| "<" <ident> <attr>* ">" (<elem> | <inner>)* "<" "/" <ident> ">"
|
||||
| "<" <ident> <attr>* "/" ">"
|
||||
<attr> ::= <ident> "=" <str>
|
||||
<attr> ::= <ident> "=" (<str> | <ident>)
|
||||
<inner> ::= (<ident> | <dot> | <at> | <hash> | <num> | <comma> | <semi>)+
|
||||
|
||||
|
@ -50,7 +50,7 @@ std::vector<Token> Tokenizer::tokenize()
|
||||
if (is_alpha(*it)) {
|
||||
do {
|
||||
utf8::append(*it, std::back_inserter(buffer));
|
||||
} while (++it != end && is_alpha_num(*it));
|
||||
} while (++it != end && (is_alpha_num(*it) || *it == '-'));
|
||||
--it;
|
||||
tokens.push_back({ .type = TokenType::ident, .value = buffer });
|
||||
buffer.clear();
|
||||
@ -190,7 +190,14 @@ Parser::Parser(std::vector<Token> tokens)
|
||||
|
||||
/// Parses the token stream into a document node.
///
/// An optional doctype declaration is read first via parse_doc_type(); after
/// that, top-level elements are collected with parse_elem() until it stops
/// yielding one.
///
/// @return The populated document. `doc_type` stays empty when no doctype
///         declaration was matched.
NodeDoc Parser::parse()
{
    NodeDoc doc;

    if (auto doc_type = parse_doc_type()) {
        doc.doc_type = std::move(*doc_type);
    }

    // Move each parsed element into the document instead of copying the
    // whole subtree it owns.
    while (auto elem = parse_elem()) {
        doc.children.push_back(std::move(*elem));
    }

    return doc;
}
|
||||
|
||||
std::optional<std::reference_wrapper<Token>> Parser::peek(size_t ahead)
|
||||
@ -220,30 +227,20 @@ bool is_ci_equal(const std::string& s1, const std::string& s2)
|
||||
|
||||
std::optional<NodeDocType> Parser::parse_doc_type()
|
||||
{
|
||||
if (!peek().has_value()) {
|
||||
return {};
|
||||
if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim)
|
||||
&& peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident)
|
||||
&& peek_is(4, TokenType::gt)) {
|
||||
consume();
|
||||
consume();
|
||||
consume();
|
||||
NodeDocType doc_type;
|
||||
doc_type.type = consume().value.value();
|
||||
consume();
|
||||
return doc_type;
|
||||
}
|
||||
Token& ahead1 = peek().value().get();
|
||||
if (ahead1.type != TokenType::lt) {
|
||||
return {};
|
||||
}
|
||||
if (!peek(2).has_value()) {
|
||||
return {};
|
||||
}
|
||||
Token& ahead2 = peek(2).value().get();
|
||||
if (ahead2.type != TokenType::ident || !ahead2.value.has_value() || !is_ci_equal(ahead2.value.value(), "doctype")) {
|
||||
return {};
|
||||
}
|
||||
consume();
|
||||
consume();
|
||||
Token& ahead3 = peek(3).value().get();
|
||||
if (ahead3.type != TokenType::ident) {
|
||||
std::cerr << "[ERROR] Expected identifier" << std::endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
NodeDocType doc_type;
|
||||
doc_type.type = consume().value.value();
|
||||
return {};
|
||||
}
|
||||
|
||||
bool Parser::peek_is(size_t ahead, TokenType type)
|
||||
{
|
||||
if (!peek(ahead).has_value()) {
|
||||
@ -259,6 +256,7 @@ bool Parser::peek_is_with_val(size_t ahead, TokenType type)
|
||||
}
|
||||
return peek(ahead).value().get().value.has_value();
|
||||
}
|
||||
|
||||
bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
|
||||
{
|
||||
if (!peek_is_with_val(ahead, type)) {
|
||||
@ -271,5 +269,80 @@ bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parse
|
||||
return is_ci_equal(peek(ahead).value().get().value.value(), val);
|
||||
}
|
||||
}
|
||||
std::optional<NodeElem> Parser::parse_elem()
|
||||
{
|
||||
if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) {
|
||||
consume();
|
||||
std::string tag = consume().value.value();
|
||||
std::vector<NodeAttr> attributes;
|
||||
while (auto attr = parse_attr()) {
|
||||
attributes.push_back(attr.value());
|
||||
}
|
||||
if (peek_is(0, TokenType::gt)) {
|
||||
consume();
|
||||
NodeElemReg elem_reg;
|
||||
elem_reg.tag = tag;
|
||||
elem_reg.attributes = std::move(attributes);
|
||||
while (peek().has_value()
|
||||
&& !(
|
||||
peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
|
||||
&& peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) {
|
||||
if (auto child = parse_elem()) {
|
||||
elem_reg.inner.push_back(child.value());
|
||||
}
|
||||
else if (peek().has_value()) {
|
||||
elem_reg.inner.push_back(consume().to_string());
|
||||
}
|
||||
else {
|
||||
assert(false && "Unexpected");
|
||||
}
|
||||
}
|
||||
if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
|
||||
&& peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt)) {
|
||||
consume();
|
||||
consume();
|
||||
consume();
|
||||
consume();
|
||||
NodeElem elem;
|
||||
elem.var = std::move(elem_reg);
|
||||
return elem;
|
||||
}
|
||||
else {
|
||||
assert(false && "Unexpected");
|
||||
}
|
||||
}
|
||||
else if (peek_is(0, TokenType::fslash)) {
|
||||
NodeElemSelfClose elem_close;
|
||||
elem_close.tag = tag;
|
||||
elem_close.attributes = std::move(attributes);
|
||||
consume();
|
||||
if (!peek_is(0, TokenType::gt)) {
|
||||
assert(false && "Unexpected");
|
||||
}
|
||||
consume();
|
||||
NodeElem elem;
|
||||
elem.var = std::move(elem_close);
|
||||
return elem;
|
||||
}
|
||||
else {
|
||||
assert(false && "Unexpected");
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
std::optional<NodeAttr> Parser::parse_attr()
|
||||
{
|
||||
if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq)
|
||||
&& (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) {
|
||||
NodeAttr attr;
|
||||
attr.key = consume().value.value();
|
||||
consume();
|
||||
attr.val = consume().value.value();
|
||||
return attr;
|
||||
}
|
||||
else {
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -58,16 +58,12 @@ struct NodeAttr {
|
||||
std::string val;
|
||||
};
|
||||
|
||||
struct NodeInner {
|
||||
std::string value;
|
||||
};
|
||||
|
||||
struct NodeElem;
|
||||
|
||||
struct NodeElemReg {
|
||||
std::string tag;
|
||||
std::vector<NodeAttr> attributes;
|
||||
std::vector<std::variant<NodeElem, NodeInner>> inner;
|
||||
std::vector<std::variant<NodeElem, std::string>> inner;
|
||||
};
|
||||
|
||||
struct NodeElemSelfClose {
|
||||
@ -80,8 +76,8 @@ struct NodeElem {
|
||||
};
|
||||
|
||||
struct NodeDoc {
|
||||
std::optional<NodeDocType> doc_type;
|
||||
std::vector<NodeElem> children;
|
||||
std::optional<NodeDocType> doc_type {};
|
||||
std::vector<NodeElem> children {};
|
||||
};
|
||||
|
||||
class Parser {
|
||||
@ -91,6 +87,8 @@ public:
|
||||
NodeDoc parse();
|
||||
|
||||
std::optional<NodeDocType> parse_doc_type();
|
||||
std::optional<NodeElem> parse_elem();
|
||||
std::optional<NodeAttr> parse_attr();
|
||||
|
||||
private:
|
||||
std::optional<std::reference_wrapper<Token>> peek(size_t ahead = 0);
|
||||
|
@ -34,9 +34,12 @@ int main()
|
||||
if (page_data.has_value()) {
|
||||
html::Tokenizer tokenizer(page_data.value());
|
||||
std::vector<html::Token> tokens = tokenizer.tokenize();
|
||||
for (html::Token token : tokens) {
|
||||
std::cout << token.to_string() << std::endl;
|
||||
}
|
||||
// for (html::Token token : tokens) {
|
||||
// std::cout << token.to_string() << std::endl;
|
||||
// }
|
||||
html::Parser parser(std::move(tokens));
|
||||
html::NodeDoc doc = parser.parse();
|
||||
std::cout << "HERE" << std::endl;
|
||||
}
|
||||
|
||||
while (!window.ShouldClose()) {
|
||||
|
Loading…
Reference in New Issue
Block a user