Compare commits

...

2 Commits

Author SHA1 Message Date
fb359223f0 Implement Utf8String 2023-10-04 18:34:35 -04:00
3407b86171 Initial utf8view 2023-10-04 14:59:32 -04:00
5 changed files with 204 additions and 93 deletions

View File

@ -20,7 +20,8 @@ add_subdirectory(external/raylib-cpp-4.5.1)
add_subdirectory(external/utfcpp-3.2.5) add_subdirectory(external/utfcpp-3.2.5)
add_executable(browser src/main.cpp src/fetch.cpp add_executable(browser src/main.cpp src/fetch.cpp
src/html_parse.cpp) src/html_parse.cpp
src/utf8.hpp)
target_include_directories(browser PRIVATE external/raygui-4.0/src) target_include_directories(browser PRIVATE external/raygui-4.0/src)

View File

@ -1,8 +1,8 @@
#include "html_parse.hpp" #include "html_parse.hpp"
#include <cassert> #include <cassert>
#include <charconv>
#include <iostream> #include <iostream>
#include <utility>
#include <utf8.h> #include <utf8.h>
@ -40,7 +40,6 @@ static bool is_newline(char32_t c)
std::vector<Token> Tokenizer::tokenize() std::vector<Token> Tokenizer::tokenize()
{ {
std::string buffer;
std::vector<Token> tokens; std::vector<Token> tokens;
utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end()); utf8::iterator<std::string::iterator> it(m_source.begin(), m_source.begin(), m_source.end());
@ -48,91 +47,84 @@ std::vector<Token> Tokenizer::tokenize()
for (; it != end; ++it) { for (; it != end; ++it) {
if (is_alpha(*it)) { if (is_alpha(*it)) {
do { auto begin = it;
utf8::append(*it, std::back_inserter(buffer)); while (++it != end && (is_alpha_num(*it) || *it == '-')) { }
} while (++it != end && (is_alpha_num(*it) || *it == '-')); tokens.push_back({ .type = TokenType::ident, .view { begin, it } });
--it; --it;
tokens.push_back({ .type = TokenType::ident, .value = buffer });
buffer.clear();
} }
else if (*it == '<') { else if (*it == '<') {
tokens.push_back({ .type = TokenType::lt }); tokens.push_back({ .type = TokenType::lt, .view { it } });
} }
else if (*it == '>') { else if (*it == '>') {
tokens.push_back({ .type = TokenType::gt }); tokens.push_back({ .type = TokenType::gt, .view { it } });
} }
else if (*it == '!') { else if (*it == '!') {
tokens.push_back({ .type = TokenType::exclaim }); tokens.push_back({ .type = TokenType::exclaim, .view { it } });
} }
else if (*it == '/') { else if (*it == '/') {
tokens.push_back({ .type = TokenType::fslash }); tokens.push_back({ .type = TokenType::fslash, .view { it } });
} }
else if (*it == '=') { else if (*it == '=') {
tokens.push_back({ .type = TokenType::eq }); tokens.push_back({ .type = TokenType::eq, .view { it } });
} }
else if (*it == '-') { else if (*it == '-') {
tokens.push_back({ .type = TokenType::minus }); tokens.push_back({ .type = TokenType::minus, .view { it } });
} }
else if (*it == ':') { else if (*it == ':') {
tokens.push_back({ .type = TokenType::colon }); tokens.push_back({ .type = TokenType::colon, .view { it } });
} }
else if (*it == '#') { else if (*it == '#') {
tokens.push_back({ .type = TokenType::hash }); tokens.push_back({ .type = TokenType::hash, .view { it } });
} }
else if (*it == ';') { else if (*it == ';') {
tokens.push_back({ .type = TokenType::semi }); tokens.push_back({ .type = TokenType::semi, .view { it } });
} }
else if (is_num(*it)) { else if (is_num(*it)) {
do { auto begin = it;
utf8::append(*it, std::back_inserter(buffer)); while (++it != end && is_num(*it)) { }
} while (++it != end && is_num(*it)); tokens.push_back({ .type = TokenType::num, .view { begin, it } });
--it; --it;
tokens.push_back({ .type = TokenType::num, .value = buffer });
buffer.clear();
} }
else if (*it == ',') { else if (*it == ',') {
tokens.push_back({ .type = TokenType::comma }); tokens.push_back({ .type = TokenType::comma, .view { it } });
} }
else if (*it == '{') { else if (*it == '{') {
tokens.push_back({ .type = TokenType::left_curly }); tokens.push_back({ .type = TokenType::left_curly, .view { it } });
} }
else if (*it == '}') { else if (*it == '}') {
tokens.push_back({ .type = TokenType::right_curly }); tokens.push_back({ .type = TokenType::right_curly, .view { it } });
} }
else if (*it == '(') { else if (*it == '(') {
tokens.push_back({ .type = TokenType::left_paren }); tokens.push_back({ .type = TokenType::left_paren, .view { it } });
} }
else if (*it == ')') { else if (*it == ')') {
tokens.push_back({ .type = TokenType::right_paren }); tokens.push_back({ .type = TokenType::right_paren, .view { it } });
} }
else if (*it == '.') { else if (*it == '.') {
tokens.push_back({ .type = TokenType::dot }); tokens.push_back({ .type = TokenType::dot, .view { it } });
} }
else if (*it == '@') { else if (*it == '@') {
tokens.push_back({ .type = TokenType::at }); tokens.push_back({ .type = TokenType::at, .view { it } });
} }
else if (*it == '"') { else if (*it == '"') {
auto begin = it;
++it; ++it;
while (it != end && *it != '"') { while (it != end && *it != '"') {
utf8::append(*it, std::back_inserter(buffer));
++it; ++it;
} }
tokens.push_back({ .type = TokenType::str, .value = buffer }); tokens.push_back({ .type = TokenType::str, .view { begin, it } });
buffer.clear();
} }
else if (is_space(*it) || is_newline(*it)) { else if (is_space(*it) || is_newline(*it)) {
} }
else { else {
utf8::append(*it, std::back_inserter(buffer)); std::cout << "[WARN] Unexpected token" << std::endl;
std::cout << "[WARN] Unexpected token: " << buffer << std::endl;
buffer.clear();
} }
} }
return tokens; return tokens;
} }
std::string Token::to_string() std::string Token::to_string() const
{ {
switch (type) { switch (type) {
case TokenType::lt: case TokenType::lt:
@ -140,13 +132,11 @@ std::string Token::to_string()
case TokenType::gt: case TokenType::gt:
return ">"; return ">";
case TokenType::ident: case TokenType::ident:
assert(value.has_value()); return "IDENT"; // TODO
return value.value();
case TokenType::fslash: case TokenType::fslash:
return "/"; return "/";
case TokenType::str: case TokenType::str:
assert(value.has_value()); return "STR"; // TODO
return value.value();
case TokenType::exclaim: case TokenType::exclaim:
return "!"; return "!";
case TokenType::eq: case TokenType::eq:
@ -174,8 +164,7 @@ std::string Token::to_string()
case TokenType::semi: case TokenType::semi:
return ";"; return ";";
case TokenType::num: case TokenType::num:
assert(value.has_value()); return "NUM"; // TODO
return value.value();
default: default:
assert(false && "Unimplemented"); assert(false && "Unimplemented");
return ""; return "";
@ -196,10 +185,10 @@ NodeDoc Parser::parse()
} }
while (peek().has_value()) { while (peek().has_value()) {
if (auto elem = parse_elem()) { if (auto elem = parse_elem()) {
doc.children.push_back(elem.value()); doc.children.emplace_back(elem.value());
} }
else { else {
doc.children.push_back(consume().to_string()); doc.children.emplace_back(Utf8String(consume().view));
} }
} }
return doc; return doc;
@ -233,13 +222,12 @@ bool is_ci_equal(const std::string& s1, const std::string& s2)
std::optional<NodeDocType> Parser::parse_doc_type() std::optional<NodeDocType> Parser::parse_doc_type()
{ {
if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim) if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::exclaim)
&& peek_is(2, TokenType::ident, "doctype", StrCmp::case_insensitive) && peek_is_with_val(3, TokenType::ident) && peek_is(2, TokenType::ident, Utf8View("doctype"), StrCmp::case_insensitive) && peek_is(3, TokenType::ident)
&& peek_is(4, TokenType::gt)) { && peek_is(4, TokenType::gt)) {
consume(); consume();
consume(); consume();
consume(); consume();
NodeDocType doc_type; NodeDocType doc_type { .type = Utf8String(consume().view) };
doc_type.type = consume().value.value();
consume(); consume();
return doc_type; return doc_type;
} }
@ -254,49 +242,40 @@ bool Parser::peek_is(size_t ahead, TokenType type)
return peek(ahead).value().get().type == type; return peek(ahead).value().get().type == type;
} }
bool Parser::peek_is_with_val(size_t ahead, TokenType type) bool Parser::peek_is(size_t ahead, TokenType type, const Utf8View& val, Parser::StrCmp cmp)
{ {
if (!peek_is(ahead, type)) { if (!peek_is(ahead, type)) {
return false; return false;
} }
return peek(ahead).value().get().value.has_value();
}
bool Parser::peek_is(size_t ahead, TokenType type, const std::string& val, Parser::StrCmp cmp)
{
if (!peek_is_with_val(ahead, type)) {
return false;
}
switch (cmp) { switch (cmp) {
case StrCmp::case_sensitive: case StrCmp::case_sensitive:
return peek(ahead).value().get().value.value() == val; return peek(ahead).value().get().view == val;
case StrCmp::case_insensitive: case StrCmp::case_insensitive:
return is_ci_equal(peek(ahead).value().get().value.value(), val); return peek(ahead).value().get().view.case_ins_equals(val);
} }
} }
std::optional<NodeElem> Parser::parse_elem() std::optional<NodeElem> Parser::parse_elem()
{ {
if (peek_is(0, TokenType::lt) && peek_is_with_val(1, TokenType::ident)) { if (peek_is(0, TokenType::lt) && peek_is(1, TokenType::ident)) {
consume(); consume();
std::string tag = consume().value.value(); Utf8View tag = consume().view;
std::vector<NodeAttr> attributes; std::vector<NodeAttr> attributes;
while (auto attr = parse_attr()) { while (auto attr = parse_attr()) {
attributes.push_back(attr.value()); attributes.push_back(attr.value());
} }
if (peek_is(0, TokenType::gt)) { if (peek_is(0, TokenType::gt)) {
consume(); consume();
NodeElemReg elem_reg; NodeElemReg elem_reg { .tag = Utf8String(tag), .attributes = std::move(attributes) };
elem_reg.tag = tag;
elem_reg.attributes = std::move(attributes);
while (peek().has_value() while (peek().has_value()
&& !( && !(
peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash) peek_is(0, TokenType::lt) && peek_is(1, TokenType::fslash)
&& peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) { && peek_is(2, TokenType::ident, tag, StrCmp::case_insensitive) && peek_is(3, TokenType::gt))) {
if (auto child = parse_elem()) { if (auto child = parse_elem()) {
elem_reg.inner.push_back(child.value()); elem_reg.inner.emplace_back(child.value());
} }
else if (peek().has_value()) { else if (peek().has_value()) {
elem_reg.inner.push_back(consume().to_string()); elem_reg.inner.emplace_back(Utf8String(consume().view));
} }
else { else {
assert(false && "Unexpected"); assert(false && "Unexpected");
@ -308,8 +287,7 @@ std::optional<NodeElem> Parser::parse_elem()
consume(); consume();
consume(); consume();
consume(); consume();
NodeElem elem; NodeElem elem { .var = std::move(elem_reg) };
elem.var = std::move(elem_reg);
return elem; return elem;
} }
else { else {
@ -317,16 +295,13 @@ std::optional<NodeElem> Parser::parse_elem()
} }
} }
else if (peek_is(0, TokenType::fslash)) { else if (peek_is(0, TokenType::fslash)) {
NodeElemSelfClose elem_close; NodeElemSelfClose elem_close { .tag = Utf8String(tag), .attributes = std::move(attributes) };
elem_close.tag = tag;
elem_close.attributes = std::move(attributes);
consume(); consume();
if (!peek_is(0, TokenType::gt)) { if (!peek_is(0, TokenType::gt)) {
assert(false && "Unexpected"); assert(false && "Unexpected");
} }
consume(); consume();
NodeElem elem; NodeElem elem { .var = std::move(elem_close) };
elem.var = std::move(elem_close);
return elem; return elem;
} }
else { else {
@ -337,12 +312,12 @@ std::optional<NodeElem> Parser::parse_elem()
} }
std::optional<NodeAttr> Parser::parse_attr() std::optional<NodeAttr> Parser::parse_attr()
{ {
if (peek_is_with_val(0, TokenType::ident) && peek_is(1, TokenType::eq) if (peek_is(0, TokenType::ident) && peek_is(1, TokenType::eq)
&& (peek_is_with_val(2, TokenType::ident) || peek_is_with_val(2, TokenType::str))) { && (peek_is(2, TokenType::ident) || peek_is(2, TokenType::str))) {
NodeAttr attr; Utf8String key = Utf8String(consume().view);
attr.key = consume().value.value();
consume(); consume();
attr.val = consume().value.value(); Utf8String val = Utf8String(consume().view);
NodeAttr attr { .key = key, .val = val };
return attr; return attr;
} }
else { else {

View File

@ -6,11 +6,12 @@
#include <variant> #include <variant>
#include <vector> #include <vector>
#include <utf8.h> #include "utf8.hpp"
namespace html { namespace html {
enum class TokenType { enum class TokenType {
unknown,
lt, lt,
gt, gt,
ident, ident,
@ -33,10 +34,10 @@ enum class TokenType {
}; };
struct Token { struct Token {
TokenType type; TokenType type = TokenType::unknown;
std::optional<std::string> value = {}; Utf8View view;
std::string to_string(); std::string to_string() const;
}; };
class Tokenizer { class Tokenizer {
@ -50,24 +51,24 @@ private:
}; };
struct NodeDocType { struct NodeDocType {
std::string type; Utf8String type;
}; };
struct NodeAttr { struct NodeAttr {
std::string key; Utf8String key;
std::string val; Utf8String val;
}; };
struct NodeElem; struct NodeElem;
struct NodeElemReg { struct NodeElemReg {
std::string tag; Utf8String tag;
std::vector<NodeAttr> attributes; std::vector<NodeAttr> attributes;
std::vector<std::variant<NodeElem, std::string>> inner; std::vector<std::variant<NodeElem, Utf8String>> inner;
}; };
struct NodeElemSelfClose { struct NodeElemSelfClose {
std::string tag; Utf8String tag;
std::vector<NodeAttr> attributes; std::vector<NodeAttr> attributes;
}; };
@ -77,7 +78,7 @@ struct NodeElem {
struct NodeDoc { struct NodeDoc {
std::optional<NodeDocType> doc_type {}; std::optional<NodeDocType> doc_type {};
std::vector<std::variant<NodeElem, std::string>> children {}; std::vector<std::variant<NodeElem, Utf8String>> children {};
}; };
class Parser { class Parser {
@ -96,8 +97,7 @@ private:
enum class StrCmp { case_sensitive, case_insensitive }; enum class StrCmp { case_sensitive, case_insensitive };
bool peek_is(size_t ahead, TokenType type); bool peek_is(size_t ahead, TokenType type);
bool peek_is_with_val(size_t ahead, TokenType type); bool peek_is(size_t ahead, TokenType type, const Utf8View& val, StrCmp cmp);
bool peek_is(size_t ahead, TokenType type, const std::string& val, StrCmp cmp);
Token& consume(); Token& consume();

View File

@ -13,7 +13,8 @@ int main()
{ {
init_curl(); init_curl();
std::optional<std::string> page_data = fetch_url("test://text.html"); // std::optional<std::string> page_data = fetch_url("test://text.html");
std::optional<std::string> page_data = fetch_url("https://example.com");
SetConfigFlags(ConfigFlags::FLAG_WINDOW_RESIZABLE | ConfigFlags::FLAG_MSAA_4X_HINT | ConfigFlags ::FLAG_VSYNC_HINT); SetConfigFlags(ConfigFlags::FLAG_WINDOW_RESIZABLE | ConfigFlags::FLAG_MSAA_4X_HINT | ConfigFlags ::FLAG_VSYNC_HINT);
@ -26,7 +27,8 @@ int main()
SetTargetFPS(60); SetTargetFPS(60);
bool is_editing_url = false; bool is_editing_url = false;
std::string url_input = "test://text.html"; std::string url_input = "https://example.com";
// std::string url_input = "test://text.html";
url_input.reserve(1024); url_input.reserve(1024);
float scroll_pos = 0.0f; float scroll_pos = 0.0f;

133
src/utf8.hpp Normal file
View File

@ -0,0 +1,133 @@
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>

#include <utf8.h>
/// Lower-cases a single code point.
///
/// Only the ASCII capitals 'A'..'Z' are mapped to 'a'..'z'; every other
/// value, including non-ASCII letters, is returned unchanged.
///
/// @param c Code point (or byte value) to convert.
/// @return The lower-cased value, or `c` itself when it is not an ASCII capital.
inline uint32_t utf8_to_lower(uint32_t c)
{
    const bool is_ascii_upper = c >= 'A' && c <= 'Z';
    // 'a' - 'A' is the fixed distance between the two ASCII letter ranges.
    return is_ascii_upper ? c + ('a' - 'A') : c;
}
class Utf8View {
public:
using Iterator = utf8::iterator<std::string::iterator>;
using ConstIterator = utf8::iterator<std::string::const_iterator>;
inline explicit Utf8View(const std::string& str)
: m_begin(str.begin())
, m_end(str.end())
{
}
inline explicit Utf8View(const Iterator& begin)
: m_begin(begin.base())
, m_end(begin.base() + 1)
{
}
inline Utf8View(const Iterator& begin, const Iterator& end)
: m_begin(begin.base())
, m_end(end.base())
{
}
[[nodiscard]] inline bool operator==(const Utf8View& other) const
{
if (size() != other.size()) {
return false;
}
for (int64_t i = 0; i < size(); i++) {
if ((*this)[i] != other[i]) {
return false;
}
}
return true;
}
[[nodiscard]] inline bool case_ins_equals(const Utf8View& other) const
{
if (size() != other.size()) {
return false;
}
for (int64_t i = 0; i < size(); i++) {
if (utf8_to_lower((*this)[i]) != utf8_to_lower(other[i])) {
return false;
}
}
return true;
}
[[nodiscard]] inline size_t size() const
{
return m_end - m_begin;
}
[[nodiscard]] inline uint32_t operator[](int64_t index) const
{
return *(m_begin + index);
}
[[nodiscard]] inline ConstIterator cbegin() const
{
return ConstIterator(m_begin, m_begin, m_end);
}
[[nodiscard]] inline ConstIterator cend() const
{
return ConstIterator(m_end, m_begin, m_end);
}
private:
std::string::const_iterator m_begin;
std::string::const_iterator m_end;
};
class Utf8String {
public:
using Iterator = utf8::iterator<std::string::iterator>;
using ConstIterator = utf8::iterator<std::string::const_iterator>;
explicit inline Utf8String(std::string str)
: m_str(std::move(str))
{
}
explicit inline Utf8String(const Utf8View& view)
: m_str(view.cbegin().base(), view.cend().base())
{
}
inline Iterator begin()
{
return Iterator(m_str.begin(), m_str.begin(), m_str.end());
}
inline Iterator end()
{
return Iterator(m_str.end(), m_str.begin(), m_str.end());
}
[[nodiscard]] inline ConstIterator cbegin() const
{
return ConstIterator(m_str.begin(), m_str.begin(), m_str.end());
}
[[nodiscard]] inline ConstIterator cend() const
{
return ConstIterator(m_str.end(), m_str.begin(), m_str.end());
}
bool operator==(const Utf8String& other) const
{
return m_str == other.m_str;
}
uint32_t operator[](size_t index)
{
auto it = begin();
for (size_t i = 0; i < index; i++) {
++it;
}
return *it;
}
private:
std::string m_str;
};