NOTE(review): this patch was rebuilt from a copy that had lost its line breaks,
indentation, and every angle-bracketed token (template arguments, #include
targets, grammar nonterminals). Text before the first "diff --git" line is
ignored by git-apply and patch.

Restored, because usage inside this patch pins the type:
  - std::vector<Token>, std::optional<std::reference_wrapper<Token>> and
    std::optional<NodeDocType> on Parser (declaration and definition).

Still missing and left as found -- restore from the upstream commit:
  - #include targets in src/html_parse.hpp (the two added ones are presumably
    <functional> and <variant> -- confirm).
  - Template arguments on the members of the Node* structs.
  - Nonterminal names in docs/html_grammar.txt.
  - Indentation (4 spaces assumed) and the position of blank context lines.

Changes to the code added in src/html_parse.cpp (hunk length recomputed; the
"index" hashes are the ones found in the damaged copy):
  - is_ci_equal: the closing parenthesis was misplaced, so std::tolower was
    applied to the result of the comparison instead of to each character.
  - parse_doc_type: consumed two tokens and then read peek(3) unchecked, never
    consumed the "doctype" keyword, and had no return statement.
  - peek_is (string overload): had no return after the switch.

diff --git a/docs/html_grammar.txt b/docs/html_grammar.txt
index 5e2f889..412109a 100644
--- a/docs/html_grammar.txt
+++ b/docs/html_grammar.txt
@@ -5,7 +5,7 @@ Defined Tokens
 ---
 
- ::= []
+ ::= [] *
 
  ::= "<" "!" "doctype" "html" ">"
 
  ::= | "<" * ">" ( | )* "<" "/" ">"
diff --git a/src/html_parse.cpp b/src/html_parse.cpp
index ea0abf1..f3c942f 100644
--- a/src/html_parse.cpp
+++ b/src/html_parse.cpp
@@ -182,4 +182,109 @@ std::string Token::to_string()
     }
 }
 
+Parser::Parser(std::vector<Token> tokens)
+    : m_tokens(std::move(tokens))
+    , m_index(0)
+{
+}
+
+NodeDoc Parser::parse()
+{
+    return {};
+}
+
+// Returns the token `ahead` positions past the cursor, or an empty optional
+// when that position is past the end of the token list.
+std::optional<std::reference_wrapper<Token>> Parser::peek(size_t ahead)
+{
+    if (m_index + ahead >= m_tokens.size()) {
+        return {};
+    }
+    return m_tokens.at(m_index + ahead);
+}
+
+// Returns the token at the cursor and advances the cursor by one.
+Token& Parser::consume()
+{
+    return m_tokens.at(m_index++);
+}
+
+// Case-insensitive string equality (per std::tolower on each character).
+bool is_ci_equal(const std::string& s1, const std::string& s2)
+{
+    if (s1.size() != s2.size()) {
+        return false;
+    }
+    for (size_t i = 0; i < s1.size(); i++) {
+        // Lower-case each side separately; go through unsigned char because
+        // std::tolower is undefined for negative char values.
+        const int c1 = std::tolower(static_cast<unsigned char>(s1.at(i)));
+        const int c2 = std::tolower(static_cast<unsigned char>(s2.at(i)));
+        if (c1 != c2) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Parses `"<" "!" "doctype" <ident>` at the cursor.
+// Returns an empty optional, consuming nothing, when the input does not start
+// with that prefix. Prints an error and exits when the prefix is present but
+// is not followed by an identifier carrying a value.
+std::optional<NodeDocType> Parser::parse_doc_type()
+{
+    // Offset 1 is the "!" token. Only its presence is checked (implied by a
+    // token existing at offset 2), matching the original lookahead.
+    if (!peek_is(0, TokenType::lt)
+        || !peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive)) {
+        return {};
+    }
+    consume(); // "<"
+    consume(); // "!"
+    consume(); // "doctype"
+    if (!peek_is_with_val(0, TokenType::ident)) {
+        std::cerr << "[ERROR] Expected identifier" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+    NodeDocType doc_type;
+    doc_type.type = consume().value.value();
+    // NOTE(review): the grammar also requires a closing ">"; it is left
+    // unconsumed here, as in the original.
+    return doc_type;
+}
+
+// True when a token exists `ahead` positions past the cursor and has `type`.
+bool Parser::peek_is(size_t ahead, TokenType type)
+{
+    const auto token = peek(ahead);
+    return token.has_value() && token->get().type == type;
+}
+
+// True when peek_is(ahead, type) holds and that token carries a value.
+bool Parser::peek_is_with_val(size_t ahead, TokenType type)
+{
+    if (!peek_is(ahead, type)) {
+        return false;
+    }
+    return peek(ahead).value().get().value.has_value();
+}
+
+// True when the token `ahead` positions past the cursor has `type` and a value
+// equal to `val` under the comparison selected by `cmp`.
+bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
+{
+    if (!peek_is_with_val(ahead, type)) {
+        return false;
+    }
+    const std::string& actual = peek(ahead).value().get().value.value();
+    switch (cmp) {
+    case StrCmp::case_sensitive:
+        return actual == val;
+    case StrCmp::case_insensitive:
+        return is_ci_equal(actual, val);
+    }
+    // Not reachable for valid StrCmp values; keeps every path returning.
+    return false;
+}
+
 }
\ No newline at end of file
diff --git a/src/html_parse.hpp b/src/html_parse.hpp
index d8e1915..be9b838 100644
--- a/src/html_parse.hpp
+++ b/src/html_parse.hpp
@@ -1,7 +1,9 @@
 #pragma once
 
+#include
 #include
 #include
+#include
 #include
 #include
 
@@ -37,12 +39,6 @@ struct Token {
     std::string to_string();
 };
 
-enum class NodeType {
-
-};
-
-struct Node { };
-
 class Tokenizer {
 public:
     explicit Tokenizer(std::string source);
@@ -53,4 +49,62 @@ private:
     std::string m_source;
 };
 
-}
\ No newline at end of file
+struct NodeDocType {
+    std::string type;
+};
+
+struct NodeAttr {
+    std::string key;
+    std::string val;
+};
+
+struct NodeInner {
+    std::string value;
+};
+
+struct NodeElem;
+
+struct NodeElemReg {
+    std::string tag;
+    std::vector attributes;
+    std::vector> inner;
+};
+
+struct NodeElemSelfClose {
+    std::string tag;
+    std::vector attributes;
+};
+
+struct NodeElem {
+    std::variant var;
+};
+
+struct NodeDoc {
+    std::optional doc_type;
+    std::vector children;
+};
+
+class Parser {
+public:
+    explicit Parser(std::vector<Token> tokens);
+
+    NodeDoc parse();
+
+    std::optional<NodeDocType> parse_doc_type();
+
+private:
+    std::optional<std::reference_wrapper<Token>> peek(size_t ahead = 0);
+
+    enum class StrCmp { case_sensitive, case_insensitive };
+
+    bool peek_is(size_t ahead, TokenType type);
+    bool peek_is_with_val(size_t ahead, TokenType type);
+    bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
+
+    Token& consume();
+
+    std::vector<Token> m_tokens;
+    size_t m_index;
+};
+
+}