diff --git a/docs/html_grammar.txt b/docs/html_grammar.txt
index 412109a..388bfd3 100644
--- a/docs/html_grammar.txt
+++ b/docs/html_grammar.txt
@@ -10,6 +10,6 @@ Defined Tokens
::=
| "<" * ">" ( | )* "<" "/" ">"
| "<" * "/" ">"
- ::= "="
+ <attr> ::= <ident> "=" ( <ident> | <str> )
::= ( | | | | | | )+
diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index f3c942f..a266084 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -50,7 +50,7 @@ std::vector Tokenizer::tokenize()
if (is_alpha(*it)) {
do {
utf8::append(*it, std::back_inserter(buffer));
- } while (++it != end && is_alpha_num(*it));
+ } while (++it != end && (is_alpha_num(*it) || *it == '-')); // after the leading letter, '-' now also continues the identifier, so "a-b" becomes one ident token
--it;
tokens.push_back({ .type = TokenType::ident, .value = buffer });
buffer.clear();
@@ -190,7 +190,14 @@ Parser::Parser(std::vector tokens)
NodeDoc Parser::parse()
{
- return {};
+ NodeDoc doc; // result: optional doctype followed by the top-level elements, in parse order
+ if (auto doc_type = parse_doc_type()) { // the doctype is optional; an empty result leaves doc.doc_type unset
+ doc.doc_type = std::move(doc_type.value()); // the local optional is discarded right after, so move instead of copy
+ }
+ while (auto elem = parse_elem()) { // keep going until parse_elem() yields an empty optional
+ doc.children.push_back(std::move(elem.value())); // hand over the parsed subtree instead of deep-copying it
+ }
+ return doc;
}
std::optional> Parser::peek(size_t ahead)
@@ -220,30 +227,20 @@ bool is_ci_equal(const std::string& s1, const std::string& s2)
std::optional Parser::parse_doc_type()
{
- if (!peek().has_value()) {
- return {};
+ if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim)
+ && peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident)
+ && peek_is(4, TokenType::gt)) { // all five tokens are checked up front: '<' '!' doctype(case-insensitive) ident '>'
+ consume(); // '<'
+ consume(); // '!'
+ consume(); // the "doctype" keyword
+ NodeDocType doc_type;
+ doc_type.type = consume().value.value(); // the identifier following the keyword; presence of a value was verified by peek_is_with_val(3, ...)
+ consume(); // '>'
+ return doc_type;
}
- Token& ahead1 = peek().value().get();
- if (ahead1.type != TokenType::lt) {
- return {};
- }
- if (!peek(2).has_value()) {
- return {};
- }
- Token& ahead2 = peek(2).value().get();
- if (ahead2.type != TokenType::ident || !ahead2.value.has_value() || !is_ci_equal(ahead2.value.value(), "doctype")) {
- return {};
- }
- consume();
- consume();
- Token& ahead3 = peek(3).value().get();
- if (ahead3.type != TokenType::ident) {
- std::cerr << "[ERROR] Expected identifier" << std::endl;
- exit(EXIT_FAILURE);
- }
- NodeDocType doc_type;
- doc_type.type = consume().value.value();
+ return {}; // no match (this includes a doctype with extra tokens before '>'); nothing has been consumed
}
+
bool Parser::peek_is(size_t ahead, TokenType type)
{
if (!peek(ahead).has_value()) {
@@ -259,6 +256,7 @@ bool Parser::peek_is_with_val(size_t ahead, TokenType type)
}
return peek(ahead).value().get().value.has_value();
}
+
bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
{
if (!peek_is_with_val(ahead, type)) {
@@ -271,5 +269,80 @@ bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parse
return is_ci_equal(peek(ahead).value().get().value.value(), val);
}
}
+std::optional Parser::parse_elem() // Parses one element: '<' ident attr* then '>' children "</" tag '>' (regular) or '/' '>' (self-closing).
+{
+ if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) { // otherwise: return empty below without consuming anything
+ consume(); // '<'
+ std::string tag = consume().value.value(); // tag name; needed later for the closing-tag comparison
+ std::vector attributes;
+ while (auto attr = parse_attr()) { // zero or more key=value pairs
+ attributes.push_back(std::move(attr.value())); // the loop-local optional dies each iteration, so move instead of copy
+ }
+ if (peek_is(0, TokenType::gt)) { // '>' closes the open tag: regular element with children
+ consume();
+ NodeElemReg elem_reg;
+ elem_reg.tag = tag; // copy on purpose: tag is still read by the close-tag checks below
+ elem_reg.attributes = std::move(attributes);
+ while (peek().has_value() // collect children until the input ends or "</" tag ">" is next (tag compared case-insensitively)
+ && !(
+ peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
+ && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) {
+ if (auto child = parse_elem()) { // nested element
+ elem_reg.inner.push_back(std::move(child.value())); // move the child subtree instead of deep-copying it at every nesting level
+ }
+ else if (peek().has_value()) {
+ elem_reg.inner.push_back(consume().to_string()); // any other token is stored as its to_string() text
+ }
+ else {
+ assert(false && "Unexpected"); // with asserts enabled this cannot be reached: the loop condition saw a token and a failed parse_elem() consumed nothing
+ }
+ }
+ if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
+ && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt)) {
+ consume(); // '<'
+ consume(); // '/'
+ consume(); // tag name
+ consume(); // '>'
+ NodeElem elem;
+ elem.var = std::move(elem_reg);
+ return elem;
+ }
+ else {
+ assert(false && "Unexpected"); // NOTE(review): hit when input ends before the matching close tag; under NDEBUG this falls through to the final return {} with tokens already consumed
+ }
+ }
+ else if (peek_is(0, TokenType::fslash)) { // '/' after the attributes: self-closing form
+ NodeElemSelfClose elem_close;
+ elem_close.tag = std::move(tag); // tag is not read again on this path
+ elem_close.attributes = std::move(attributes);
+ consume(); // '/'
+ if (!peek_is(0, TokenType::gt)) {
+ assert(false && "Unexpected"); // NOTE(review): under NDEBUG the consume() below still runs although the next token is not '>'
+ }
+ consume(); // '>'
+ NodeElem elem;
+ elem.var = std::move(elem_close);
+ return elem;
+ }
+ else {
+ assert(false && "Unexpected"); // NOTE(review): hit when the attributes are followed by neither '>' nor '/', e.g. a key without "=value", which parse_attr() rejects
+ }
+ }
+ return {}; // not positioned at '<' ident (or an assert above was compiled out)
+}
+std::optional Parser::parse_attr() // Parses one attribute of the form key '=' value; returns empty without consuming anything if that shape is not next.
+{
+ if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq)
+ && (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) { // value may be an identifier or a string token; both must carry a value
+ NodeAttr attr;
+ attr.key = consume().value.value(); // key identifier
+ consume(); // '='
+ attr.val = consume().value.value(); // ident or str payload, stored as-is
+ return attr;
+ }
+ else {
+ return {}; // includes a bare key with no '=' following it
+ }
+}
}
\ No newline at end of file
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index be9b838..ae2cdca 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -58,16 +58,12 @@ struct NodeAttr {
std::string val;
};
-struct NodeInner {
- std::string value;
-};
-
struct NodeElem;
struct NodeElemReg {
std::string tag;
std::vector attributes;
- std::vector> inner;
+ std::vector> inner; // children in parse order: Parser::parse_elem() appends nested elements and the to_string() text of other tokens
};
struct NodeElemSelfClose {
@@ -80,8 +76,8 @@ struct NodeElem {
};
struct NodeDoc {
- std::optional doc_type;
- std::vector children;
+ std::optional doc_type {}; // assigned by Parser::parse() only when parse_doc_type() returns a value
+ std::vector children {}; // top-level elements appended by Parser::parse(), in parse order
};
class Parser {
@@ -91,6 +87,8 @@ public:
NodeDoc parse();
std::optional parse_doc_type();
+ std::optional parse_elem(); // one element (regular or self-closing); empty if the next tokens are not '<' ident
+ std::optional parse_attr(); // one key '=' value attribute; empty if that shape is not next
private:
std::optional> peek(size_t ahead = 0);
diff --git a/src/main.cpp b/src/main.cpp
index 4b471a9..fd89fc3 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -34,9 +34,12 @@ int main()
if (page_data.has_value()) {
html::Tokenizer tokenizer(page_data.value());
std::vector tokens = tokenizer.tokenize();
- for (html::Token token : tokens) {
- std::cout << token.to_string() << std::endl;
- }
+ // for (html::Token token : tokens) {
+ // std::cout << token.to_string() << std::endl;
+ // }
+ html::Parser parser(std::move(tokens)); // tokens is handed to the parser; it is not read again in this scope
+ html::NodeDoc doc = parser.parse(); // NOTE(review): doc is not read before this scope closes
+ std::cout << "HERE" << std::endl; // NOTE(review): looks like a leftover debug marker - confirm before merging
}
while (!window.ShouldClose()) {