diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d266d3..0a1aa3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,8 @@ add_subdirectory(external/raylib-cpp-4.5.1) add_subdirectory(external/utfcpp-3.2.5) add_executable(browser src/main.cpp src/fetch.cpp - src/html_parse.cpp) + src/html_parse.cpp + src/utf8.hpp) target_include_directories(browser PRIVATE external/raygui-4.0/src) diff --git a/src/html_parse.cpp b/src/html_parse.cpp index 3e2d691..250a387 100644 --- a/src/html_parse.cpp +++ b/src/html_parse.cpp @@ -1,8 +1,8 @@ #include "html_parse.hpp" #include -#include #include +#include #include @@ -49,7 +49,7 @@ std::vector Tokenizer::tokenize() if (is_alpha(*it)) { auto begin = it; while (++it != end && (is_alpha_num(*it) || *it == '-')) { } - Token token { .type = TokenType::ident, .view { begin, it } }; + tokens.push_back({ .type = TokenType::ident, .view { begin, it } }); --it; } else if (*it == '<') { @@ -81,7 +81,7 @@ std::vector Tokenizer::tokenize() } else if (is_num(*it)) { auto begin = it; - while (++it != end && is_num(*it)) { }; + while (++it != end && is_num(*it)) { } tokens.push_back({ .type = TokenType::num, .view { begin, it } }); --it; } @@ -124,7 +124,7 @@ std::vector Tokenizer::tokenize() return tokens; } -std::string Token::to_string() +std::string Token::to_string() const { switch (type) { case TokenType::lt: @@ -185,10 +185,10 @@ NodeDoc Parser::parse() } while (peek().has_value()) { if (auto elem = parse_elem()) { - doc.children.push_back(elem.value()); + doc.children.emplace_back(elem.value()); } else { - doc.children.push_back(consume().to_string()); + doc.children.emplace_back(Utf8String(consume().view)); } } return doc; @@ -222,12 +222,12 @@ bool is_ci_equal(const std::string& s1, const std::string& s2) std::optional Parser::parse_doc_type() { if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim) - && peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident) + && peek_is(2, TokenType::ident, Utf8View("doctype"), StrCmp::case_insensitive) && peek_is(3, TokenType::ident) && peek_is(4, TokenType::gt)) { consume(); consume(); consume(); - NodeDocType doc_type { .type = consume().view }; + NodeDocType doc_type { .type = Utf8String(consume().view) }; consume(); return doc_type; } @@ -242,41 +242,40 @@ bool Parser::peek_is(size_t ahead, TokenType type) return peek(ahead).value().get().type == type; } -bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp) +bool Parser::peek_is(size_t ahead, TokenType type, const Utf8View& val, Parser::StrCmp cmp) { if (!peek_is(ahead, type)) { return false; } switch (cmp) { case StrCmp::case_sensitive: - return peek(ahead).value().get().value.value() == val; + return peek(ahead).value().get().view == val; case StrCmp::case_insensitive: - return is_ci_equal(peek(ahead).value().get().value.value(), val); + return peek(ahead).value().get().view.case_ins_equals(val); } } + std::optional Parser::parse_elem() { - if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) { + if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::ident)) { consume(); - std::string tag = consume().value.value(); + Utf8View tag = consume().view; std::vector attributes; while (auto attr = parse_attr()) { attributes.push_back(attr.value()); } if (peek_is(0, TokenType::gt)) { consume(); - NodeElemReg elem_reg; - elem_reg.tag = tag; - elem_reg.attributes = std::move(attributes); + NodeElemReg elem_reg { .tag = Utf8String(tag), .attributes = std::move(attributes) }; while (peek().has_value() && !( peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash) && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) { if (auto child = parse_elem()) { - elem_reg.inner.push_back(child.value()); + elem_reg.inner.emplace_back(child.value()); } else if (peek().has_value()) { - elem_reg.inner.push_back(consume().to_string()); + elem_reg.inner.emplace_back(Utf8String(consume().view)); } else { assert(false && "Unexpected"); @@ -288,8 +287,7 @@ std::optional Parser::parse_elem() consume(); consume(); consume(); - NodeElem elem; - elem.var = std::move(elem_reg); + NodeElem elem { .var = std::move(elem_reg) }; return elem; } else { @@ -297,16 +295,13 @@ std::optional Parser::parse_elem() } } else if (peek_is(0, TokenType::fslash)) { - NodeElemSelfClose elem_close; - elem_close.tag = tag; - elem_close.attributes = std::move(attributes); + NodeElemSelfClose elem_close { .tag = Utf8String(tag), .attributes = std::move(attributes) }; consume(); if (!peek_is(0, TokenType::gt)) { assert(false && "Unexpected"); } consume(); - NodeElem elem; - elem.var = std::move(elem_close); + NodeElem elem { .var = std::move(elem_close) }; return elem; } else { @@ -317,12 +312,12 @@ std::optional Parser::parse_elem() } std::optional Parser::parse_attr() { - if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq) - && (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) { - NodeAttr attr; - attr.key = consume().value.value(); + if (peek_is(0, TokenType::ident) && peek_is(1, TokenType::eq) + && (peek_is(2, TokenType::ident) || peek_is(2, TokenType::str))) { + Utf8String key = Utf8String(consume().view); consume(); - attr.val = consume().value.value(); + Utf8String val = Utf8String(consume().view); + NodeAttr attr { .key = key, .val = val }; return attr; } else { @@ -331,23 +326,3 @@ std::optional Parser::parse_attr() } } - -Utf8View::Utf8View(Utf8View::Iterator begin) - : m_begin(begin) - , m_end(Iterator(begin.base(), begin.base(), begin.base() + 1)) -{ -} - -Utf8View::Utf8View(Utf8View::Iterator begin, Utf8View::Iterator end) - : m_begin(begin) - , m_end(end) -{ -} -Utf8View::Iterator Utf8View::begin() const -{ - return m_begin; -} -Utf8View::Iterator Utf8View::end() const -{ - return m_end; -} diff --git a/src/html_parse.hpp b/src/html_parse.hpp index 649de6c..c2cbe7f 100644 --- a/src/html_parse.hpp +++ b/src/html_parse.hpp @@ -6,26 +6,12 @@ #include #include -#include - -class Utf8View { -public: - using Iterator = utf8::iterator; - - explicit Utf8View(Iterator begin); - Utf8View(Iterator begin, Iterator end); - - [[nodiscard]] Iterator begin() const; - [[nodiscard]] Iterator end() const; - -private: - Iterator m_begin; - Iterator m_end; -}; +#include "utf8.hpp" namespace html { enum class TokenType { + unknown, lt, gt, ident, @@ -48,10 +34,10 @@ enum class TokenType { }; struct Token { - TokenType type; + TokenType type = TokenType::unknown; Utf8View view; - std::string to_string(); + std::string to_string() const; }; class Tokenizer { @@ -65,24 +51,24 @@ private: }; struct NodeDocType { - Utf8View type; + Utf8String type; }; struct NodeAttr { - std::string key; - std::string val; + Utf8String key; + Utf8String val; }; struct NodeElem; struct NodeElemReg { - std::string tag; + Utf8String tag; std::vector attributes; - std::vector> inner; + std::vector> inner; }; struct NodeElemSelfClose { - std::string tag; + Utf8String tag; std::vector attributes; }; @@ -92,7 +78,7 @@ struct NodeElem { struct NodeDoc { std::optional doc_type {}; - std::vector> children {}; + std::vector> children {}; }; class Parser { @@ -111,7 +97,7 @@ private: enum class StrCmp { case_sensitive, case_insensitive }; bool peek_is(size_t ahead, TokenType type); - bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp); + bool peek_is(size_t ahead, TokenType type, const Utf8View& val, StrCmp cmp); Token& consume(); diff --git a/src/main.cpp b/src/main.cpp index 88c31a4..20895a0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,8 @@ int main() { init_curl(); - std::optional page_data = fetch_url("test://text.html"); + // std::optional page_data = fetch_url("test://text.html"); + std::optional page_data = fetch_url("https://example.com"); SetConfigFlags(ConfigFlags::FLAG_WINDOW_RESIZABLE | ConfigFlags::FLAG_MSAA_4X_HINT | ConfigFlags ::FLAG_VSYNC_HINT); @@ -26,7 +27,8 @@ int main() SetTargetFPS(60); bool is_editing_url = false; - std::string url_input = "test://text.html"; + std::string url_input = "https://example.com"; + // std::string url_input = "test://text.html"; url_input.reserve(1024); float scroll_pos = 0.0f; diff --git a/src/utf8.hpp b/src/utf8.hpp new file mode 100644 index 0000000..bcc3fb6 --- /dev/null +++ b/src/utf8.hpp @@ -0,0 +1,133 @@ +#include + +inline uint32_t utf8_to_lower(uint32_t c) +{ + if (c >= 65 && c <= 90) { + return c + 32; + } + return c; +} + +class Utf8View { +public: + using Iterator = utf8::iterator; + using ConstIterator = utf8::iterator; + + inline explicit Utf8View(const std::string& str) + : m_begin(str.begin()) + , m_end(str.end()) + { + } + + inline explicit Utf8View(const Iterator& begin) + : m_begin(begin.base()) + , m_end(begin.base() + 1) + { + } + + inline Utf8View(const Iterator& begin, const Iterator& end) + : m_begin(begin.base()) + , m_end(end.base()) + { + } + + [[nodiscard]] inline bool operator==(const Utf8View& other) const + { + if (size() != other.size()) { + return false; + } + for (int64_t i = 0; i < size(); i++) { + if ((*this)[i] != other[i]) { + return false; + } + } + return true; + } + + [[nodiscard]] inline bool case_ins_equals(const Utf8View& other) const + { + if (size() != other.size()) { + return false; + } + for (int64_t i = 0; i < size(); i++) { + if (utf8_to_lower((*this)[i]) != utf8_to_lower(other[i])) { + return false; + } + } + return true; + } + + [[nodiscard]] inline size_t size() const + { + return m_end - m_begin; + } + + [[nodiscard]] inline uint32_t operator[](int64_t index) const + { + return *(m_begin + index); + } + + [[nodiscard]] inline ConstIterator cbegin() const + { + return ConstIterator(m_begin, m_begin, m_end); + } + [[nodiscard]] inline ConstIterator cend() const + { + return ConstIterator(m_end, m_begin, m_end); + } + +private: + std::string::const_iterator m_begin; + std::string::const_iterator m_end; +}; + +class Utf8String { +public: + using Iterator = utf8::iterator; + using ConstIterator = utf8::iterator; + + explicit inline Utf8String(std::string str) + : m_str(std::move(str)) + { + } + + explicit inline Utf8String(const Utf8View& view) + : m_str(view.cbegin().base(), view.cend().base()) + { + } + + inline Iterator begin() + { + return Iterator(m_str.begin(), m_str.begin(), m_str.end()); + } + inline Iterator end() + { + return Iterator(m_str.end(), m_str.begin(), m_str.end()); + } + + [[nodiscard]] inline ConstIterator cbegin() const + { + return ConstIterator(m_str.begin(), m_str.begin(), m_str.end()); + } + [[nodiscard]] inline ConstIterator cend() const + { + return ConstIterator(m_str.end(), m_str.begin(), m_str.end()); + } + + bool operator==(const Utf8String& other) const + { + return m_str == other.m_str; + } + + uint32_t operator[](size_t index) + { + auto it = begin(); + for (size_t i = 0; i < index; i++) { + ++it; + } + return *it; + } + +private: + std::string m_str; +}; \ No newline at end of file