diff --git a/docs/html_grammar.txt b/docs/html_grammar.txt index 412109a..388bfd3 100644 --- a/docs/html_grammar.txt +++ b/docs/html_grammar.txt @@ -10,6 +10,6 @@ Defined Tokens ::= | "<" * ">" ( | )* "<" "/" ">" | "<" * "/" ">" - ::= "=" + ::= "=" ( | ) ::= ( | | | | | | )+ diff --git a/src/html_parse.cpp b/src/html_parse.cpp index f3c942f..a266084 100644 --- a/src/html_parse.cpp +++ b/src/html_parse.cpp @@ -50,7 +50,7 @@ std::vector Tokenizer::tokenize() if (is_alpha(*it)) { do { utf8::append(*it, std::back_inserter(buffer)); - } while (++it != end && is_alpha_num(*it)); + } while (++it != end && (is_alpha_num(*it) || *it == '-')); --it; tokens.push_back({ .type = TokenType::ident, .value = buffer }); buffer.clear(); @@ -190,7 +190,14 @@ Parser::Parser(std::vector tokens) NodeDoc Parser::parse() { - return {}; + NodeDoc doc; + if (auto doc_type = parse_doc_type()) { + doc.doc_type = doc_type.value(); + } + while (auto elem = parse_elem()) { + doc.children.push_back(elem.value()); + } + return doc; } std::optional> Parser::peek(size_t ahead) @@ -220,30 +227,20 @@ bool is_ci_equal(const std::string& s1, const std::string& s2) std::optional Parser::parse_doc_type() { - if (!peek().has_value()) { - return {}; + if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim) + && peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident) + && peek_is(4, TokenType::gt)) { + consume(); + consume(); + consume(); + NodeDocType doc_type; + doc_type.type = consume().value.value(); + consume(); + return doc_type; } - Token& ahead1 = peek().value().get(); - if (ahead1.type != TokenType::lt) { - return {}; - } - if (!peek(2).has_value()) { - return {}; - } - Token& ahead2 = peek(2).value().get(); - if (ahead2.type != TokenType::ident || !ahead2.value.has_value() || !is_ci_equal(ahead2.value.value(), "doctype")) { - return {}; - } - consume(); - consume(); - Token& ahead3 = peek(3).value().get(); - if (ahead3.type != TokenType::ident) { - std::cerr << "[ERROR] Expected identifier" << std::endl; - exit(EXIT_FAILURE); - } - NodeDocType doc_type; - doc_type.type = consume().value.value(); + return {}; } + bool Parser::peek_is(size_t ahead, TokenType type) { if (!peek(ahead).has_value()) { @@ -259,6 +256,7 @@ bool Parser::peek_is_with_val(size_t ahead, TokenType type) } return peek(ahead).value().get().value.has_value(); } + bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp) { if (!peek_is_with_val(ahead, type)) { @@ -271,5 +269,80 @@ bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parse return is_ci_equal(peek(ahead).value().get().value.value(), val); } } +std::optional Parser::parse_elem() +{ + if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) { + consume(); + std::string tag = consume().value.value(); + std::vector attributes; + while (auto attr = parse_attr()) { + attributes.push_back(attr.value()); + } + if (peek_is(0, TokenType::gt)) { + consume(); + NodeElemReg elem_reg; + elem_reg.tag = tag; + elem_reg.attributes = std::move(attributes); + while (peek().has_value() + && !( + peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash) + && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) { + if (auto child = parse_elem()) { + elem_reg.inner.push_back(child.value()); + } + else if (peek().has_value()) { + elem_reg.inner.push_back(consume().to_string()); + } + else { + assert(false && "Unexpected"); + } + } + if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash) + && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt)) { + consume(); + consume(); + consume(); + consume(); + NodeElem elem; + elem.var = std::move(elem_reg); + return elem; + } + else { + assert(false && "Unexpected"); + } + } + else if (peek_is(0, TokenType::fslash)) { + NodeElemSelfClose elem_close; + elem_close.tag = tag; + elem_close.attributes = std::move(attributes); + consume(); + if (!peek_is(0, TokenType::gt)) { + assert(false && "Unexpected"); + } + consume(); + NodeElem elem; + elem.var = std::move(elem_close); + return elem; + } + else { + assert(false && "Unexpected"); + } + } + return {}; +} +std::optional Parser::parse_attr() +{ + if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq) + && (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) { + NodeAttr attr; + attr.key = consume().value.value(); + consume(); + attr.val = consume().value.value(); + return attr; + } + else { + return {}; + } +} } \ No newline at end of file diff --git a/src/html_parse.hpp b/src/html_parse.hpp index be9b838..ae2cdca 100644 --- a/src/html_parse.hpp +++ b/src/html_parse.hpp @@ -58,16 +58,12 @@ struct NodeAttr { std::string val; }; -struct NodeInner { - std::string value; -}; - struct NodeElem; struct NodeElemReg { std::string tag; std::vector attributes; - std::vector> inner; + std::vector> inner; }; struct NodeElemSelfClose { @@ -80,8 +76,8 @@ struct NodeElem { }; struct NodeDoc { - std::optional doc_type; - std::vector children; + std::optional doc_type {}; + std::vector children {}; }; class Parser { @@ -91,6 +87,8 @@ public: NodeDoc parse(); std::optional parse_doc_type(); + std::optional parse_elem(); + std::optional parse_attr(); private: std::optional> peek(size_t ahead = 0); diff --git a/src/main.cpp b/src/main.cpp index 4b471a9..fd89fc3 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,9 +34,12 @@ int main() if (page_data.has_value()) { html::Tokenizer tokenizer(page_data.value()); std::vector tokens = tokenizer.tokenize(); - for (html::Token token : tokens) { - std::cout << token.to_string() << std::endl; - } + // for (html::Token token : tokens) { + // std::cout << token.to_string() << std::endl; + // } + html::Parser parser(std::move(tokens)); + html::NodeDoc doc = parser.parse(); + std::cout << "HERE" << std::endl; } while (!window.ShouldClose()) {