summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author tastytea 2020-01-15 19:09:43 +0100
committer tastytea 2020-01-15 19:09:43 +0100
commit 343fe2adca305954c3cb5ff37a6f1214c09599a3 (patch)
tree 9901d6f9d2453daf6dc7f81e7995c03cedd7b9b9
parent fad9f87391e4a064ac37795606ed8a1bb8197dcb (diff)
download mastodonpp-343fe2adca305954c3cb5ff37a6f1214c09599a3.tar
mastodonpp-343fe2adca305954c3cb5ff37a6f1214c09599a3.tar.gz
mastodonpp-343fe2adca305954c3cb5ff37a6f1214c09599a3.zip
Add unescape_html().
-rw-r--r-- include/helpers.hpp 41
-rw-r--r-- include/mastodonpp.hpp 1
-rw-r--r-- src/helpers.cpp 342
3 files changed, 384 insertions, 0 deletions
diff --git a/include/helpers.hpp b/include/helpers.hpp
new file mode 100644
index 0000000..f962256
--- /dev/null
+++ b/include/helpers.hpp
@@ -0,0 +1,41 @@
+/* This file is part of mastodonpp.
+ * Copyright © 2020 tastytea <tastytea@tastytea.de>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MASTODONPP_HELPERS_HPP
+#define MASTODONPP_HELPERS_HPP
+
+#include <string>
+#include <string_view>
+
+namespace mastodonpp
+{
+
+using std::string;
+
+/*!
+ * @brief Replaces HTML entities with UTF-8 characters.
+ *
+ * Supports named and numbered entities, decimal and hexadecimal. Text that
+ * is not a recognized `&name;` or `&#number;` reference is copied through.
+ *
+ * @param html The HTML to unescape. Taken by value; the caller's string is
+ *             not modified.
+ *
+ * @return The unescaped text, encoded as UTF-8.
+ *
+ * @since 0.4.0
+ */
+const string unescape_html(string html);
+
+} // namespace mastodonpp
+
+#endif // MASTODONPP_HELPERS_HPP
diff --git a/include/mastodonpp.hpp b/include/mastodonpp.hpp
index 65c1af7..d9655b3 100644
--- a/include/mastodonpp.hpp
+++ b/include/mastodonpp.hpp
@@ -20,6 +20,7 @@
#include "api.hpp"
#include "connection.hpp"
#include "exceptions.hpp"
+#include "helpers.hpp"
#include "instance.hpp"
#include "types.hpp"
diff --git a/src/helpers.cpp b/src/helpers.cpp
new file mode 100644
index 0000000..70fdf7d
--- /dev/null
+++ b/src/helpers.cpp
@@ -0,0 +1,342 @@
+/* This file is part of mastodonpp.
+ * Copyright © 2020 tastytea <tastytea@tastytea.de>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "helpers.hpp"
+
+#include <array>
+#include <codecvt>
+#include <locale>
+#include <regex>
+#include <string_view>
+#include <utility>
+
+namespace mastodonpp
+{
+
+using std::array;
+using std::stol;
+using std::codecvt_utf8;
+using std::wstring_convert;
+using std::regex;
+using std::regex_search;
+using std::smatch;
+using std::string_view;
+using std::move;
+using std::pair;
+
+namespace
+{
+
+// Named character references and the code points they stand for.
+// Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
+//         entity_references#Character_entity_references_in_HTML
+constexpr array<const pair<const string_view, const char32_t>, 258>
+    entity_names
+{{
+    { "exclamation", 0x0021 },
+    { "quot", 0x0022 },
+    { "percent", 0x0025 },
+    { "amp", 0x0026 },
+    { "apos", 0x0027 },
+    { "add", 0x002B },
+    { "lt", 0x003C },
+    { "equal", 0x003D },
+    { "gt", 0x003E },
+    { "nbsp", 0x00A0 },
+    { "iexcl", 0x00A1 },
+    { "cent", 0x00A2 },
+    { "pound", 0x00A3 },
+    { "curren", 0x00A4 },
+    { "yen", 0x00A5 },
+    { "brvbar", 0x00A6 },
+    { "sect", 0x00A7 },
+    { "uml", 0x00A8 },
+    { "copy", 0x00A9 },
+    { "ordf", 0x00AA },
+    { "laquo", 0x00AB },
+    { "not", 0x00AC },
+    { "shy", 0x00AD },
+    { "reg", 0x00AE },
+    { "macr", 0x00AF },
+    { "deg", 0x00B0 },
+    { "plusmn", 0x00B1 },
+    { "sup2", 0x00B2 },
+    { "sup3", 0x00B3 },
+    { "acute", 0x00B4 },
+    { "micro", 0x00B5 },
+    { "para", 0x00B6 },
+    { "middot", 0x00B7 },
+    { "cedil", 0x00B8 },
+    { "sup1", 0x00B9 },
+    { "ordm", 0x00BA },
+    { "raquo", 0x00BB },
+    { "frac14", 0x00BC },
+    { "frac12", 0x00BD },
+    { "frac34", 0x00BE },
+    { "iquest", 0x00BF },
+    { "Agrave", 0x00C0 },
+    { "Aacute", 0x00C1 },
+    { "Acirc", 0x00C2 },
+    { "Atilde", 0x00C3 },
+    { "Auml", 0x00C4 },
+    { "Aring", 0x00C5 },
+    { "AElig", 0x00C6 },
+    { "Ccedil", 0x00C7 },
+    { "Egrave", 0x00C8 },
+    { "Eacute", 0x00C9 },
+    { "Ecirc", 0x00CA },
+    { "Euml", 0x00CB },
+    { "Igrave", 0x00CC },
+    { "Iacute", 0x00CD },
+    { "Icirc", 0x00CE },
+    { "Iuml", 0x00CF },
+    { "ETH", 0x00D0 },
+    { "Ntilde", 0x00D1 },
+    { "Ograve", 0x00D2 },
+    { "Oacute", 0x00D3 },
+    { "Ocirc", 0x00D4 },
+    { "Otilde", 0x00D5 },
+    { "Ouml", 0x00D6 },
+    { "times", 0x00D7 },
+    { "Oslash", 0x00D8 },
+    { "Ugrave", 0x00D9 },
+    { "Uacute", 0x00DA },
+    { "Ucirc", 0x00DB },
+    { "Uuml", 0x00DC },
+    { "Yacute", 0x00DD },
+    { "THORN", 0x00DE },
+    { "szlig", 0x00DF },
+    { "agrave", 0x00E0 },
+    { "aacute", 0x00E1 },
+    { "acirc", 0x00E2 },
+    { "atilde", 0x00E3 },
+    { "auml", 0x00E4 },
+    { "aring", 0x00E5 },
+    { "aelig", 0x00E6 },
+    { "ccedil", 0x00E7 },
+    { "egrave", 0x00E8 },
+    { "eacute", 0x00E9 },
+    { "ecirc", 0x00EA },
+    { "euml", 0x00EB },
+    { "igrave", 0x00EC },
+    { "iacute", 0x00ED },
+    { "icirc", 0x00EE },
+    { "iuml", 0x00EF },
+    { "eth", 0x00F0 },
+    { "ntilde", 0x00F1 },
+    { "ograve", 0x00F2 },
+    { "oacute", 0x00F3 },
+    { "ocirc", 0x00F4 },
+    { "otilde", 0x00F5 },
+    { "ouml", 0x00F6 },
+    { "divide", 0x00F7 },
+    { "oslash", 0x00F8 },
+    { "ugrave", 0x00F9 },
+    { "uacute", 0x00FA },
+    { "ucirc", 0x00FB },
+    { "uuml", 0x00FC },
+    { "yacute", 0x00FD },
+    { "thorn", 0x00FE },
+    { "yuml", 0x00FF },
+    { "OElig", 0x0152 },
+    { "oelig", 0x0153 },
+    { "Scaron", 0x0160 },
+    { "scaron", 0x0161 },
+    { "Yuml", 0x0178 },
+    { "fnof", 0x0192 },
+    { "circ", 0x02C6 },
+    { "tilde", 0x02DC },
+    { "Alpha", 0x0391 },
+    { "Beta", 0x0392 },
+    { "Gamma", 0x0393 },
+    { "Delta", 0x0394 },
+    { "Epsilon", 0x0395 },
+    { "Zeta", 0x0396 },
+    { "Eta", 0x0397 },
+    { "Theta", 0x0398 },
+    { "Iota", 0x0399 },
+    { "Kappa", 0x039A },
+    { "Lambda", 0x039B },
+    { "Mu", 0x039C },
+    { "Nu", 0x039D },
+    { "Xi", 0x039E },
+    { "Omicron", 0x039F },
+    { "Pi", 0x03A0 },
+    { "Rho", 0x03A1 },
+    { "Sigma", 0x03A3 },
+    { "Tau", 0x03A4 },
+    { "Upsilon", 0x03A5 },
+    { "Phi", 0x03A6 },
+    { "Chi", 0x03A7 },
+    { "Psi", 0x03A8 },
+    { "Omega", 0x03A9 },
+    { "alpha", 0x03B1 },
+    { "beta", 0x03B2 },
+    { "gamma", 0x03B3 },
+    { "delta", 0x03B4 },
+    { "epsilon", 0x03B5 },
+    { "zeta", 0x03B6 },
+    { "eta", 0x03B7 },
+    { "theta", 0x03B8 },
+    { "iota", 0x03B9 },
+    { "kappa", 0x03BA },
+    { "lambda", 0x03BB },
+    { "mu", 0x03BC },
+    { "nu", 0x03BD },
+    { "xi", 0x03BE },
+    { "omicron", 0x03BF },
+    { "pi", 0x03C0 },
+    { "rho", 0x03C1 },
+    { "sigmaf", 0x03C2 },
+    { "sigma", 0x03C3 },
+    { "tau", 0x03C4 },
+    { "upsilon", 0x03C5 },
+    { "phi", 0x03C6 },
+    { "chi", 0x03C7 },
+    { "psi", 0x03C8 },
+    { "omega", 0x03C9 },
+    { "thetasym", 0x03D1 },
+    { "upsih", 0x03D2 },
+    { "piv", 0x03D6 },
+    { "ensp", 0x2002 },
+    { "emsp", 0x2003 },
+    { "thinsp", 0x2009 },
+    { "zwnj", 0x200C },
+    { "zwj", 0x200D },
+    { "lrm", 0x200E },
+    { "rlm", 0x200F },
+    { "ndash", 0x2013 },
+    { "mdash", 0x2014 },
+    { "horbar", 0x2015 },
+    { "lsquo", 0x2018 },
+    { "rsquo", 0x2019 },
+    { "sbquo", 0x201A },
+    { "ldquo", 0x201C },
+    { "rdquo", 0x201D },
+    { "bdquo", 0x201E },
+    { "dagger", 0x2020 },
+    { "Dagger", 0x2021 },
+    { "bull", 0x2022 },
+    { "hellip", 0x2026 },
+    { "permil", 0x2030 },
+    { "prime", 0x2032 },
+    { "Prime", 0x2033 },
+    { "lsaquo", 0x2039 },
+    { "rsaquo", 0x203A },
+    { "oline", 0x203E },
+    { "frasl", 0x2044 },
+    { "euro", 0x20AC },
+    { "image", 0x2111 },
+    { "weierp", 0x2118 },
+    { "real", 0x211C },
+    { "trade", 0x2122 },
+    { "alefsym", 0x2135 },
+    { "larr", 0x2190 },
+    { "uarr", 0x2191 },
+    { "rarr", 0x2192 },
+    { "darr", 0x2193 },
+    { "harr", 0x2194 },
+    { "crarr", 0x21B5 },
+    { "lArr", 0x21D0 },
+    { "uArr", 0x21D1 },
+    { "rArr", 0x21D2 },
+    { "dArr", 0x21D3 },
+    { "hArr", 0x21D4 },
+    { "forall", 0x2200 },
+    { "part", 0x2202 },
+    { "exist", 0x2203 },
+    { "empty", 0x2205 },
+    { "nabla", 0x2207 },
+    { "isin", 0x2208 },
+    { "notin", 0x2209 },
+    { "ni", 0x220B },
+    { "prod", 0x220F },
+    { "sum", 0x2211 },
+    { "minus", 0x2212 },
+    { "lowast", 0x2217 },
+    { "radic", 0x221A },
+    { "prop", 0x221D },
+    { "infin", 0x221E },
+    { "ang", 0x2220 },
+    { "and", 0x2227 },
+    { "or", 0x2228 },
+    { "cap", 0x2229 },
+    { "cup", 0x222A },
+    { "int", 0x222B },
+    { "there4", 0x2234 },
+    { "sim", 0x223C },
+    { "cong", 0x2245 },
+    { "asymp", 0x2248 },
+    { "ne", 0x2260 },
+    { "equiv", 0x2261 },
+    { "le", 0x2264 },
+    { "ge", 0x2265 },
+    { "sub", 0x2282 },
+    { "sup", 0x2283 },
+    { "nsub", 0x2284 },
+    { "sube", 0x2286 },
+    { "supe", 0x2287 },
+    { "oplus", 0x2295 },
+    { "otimes", 0x2297 },
+    { "perp", 0x22A5 },
+    { "sdot", 0x22C5 },
+    { "lceil", 0x2308 },
+    { "rceil", 0x2309 },
+    { "lfloor", 0x230A },
+    { "rfloor", 0x230B },
+    { "lang", 0x2329 },
+    { "rang", 0x232A },
+    { "loz", 0x25CA },
+    { "spades", 0x2660 },
+    { "clubs", 0x2663 },
+    { "hearts", 0x2665 },
+    { "diams", 0x2666 }
+}};
+
+// Returns true if c is an ASCII letter or digit. Written by hand so the
+// result does not depend on the global locale.
+constexpr bool is_ascii_alnum(const char c)
+{
+    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')
+        || (c >= 'A' && c <= 'Z');
+}
+
+// Appends the UTF-8 encoding of codepoint to out. The caller guarantees that
+// codepoint is at most 0x10FFFF.
+void append_utf8(string &out, const char32_t codepoint)
+{
+    if (codepoint < 0x80)
+    {
+        out += static_cast<char>(codepoint);
+    }
+    else if (codepoint < 0x800)
+    {
+        out += static_cast<char>(0xC0 | (codepoint >> 6));
+        out += static_cast<char>(0x80 | (codepoint & 0x3F));
+    }
+    else if (codepoint < 0x10000)
+    {
+        out += static_cast<char>(0xE0 | (codepoint >> 12));
+        out += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
+        out += static_cast<char>(0x80 | (codepoint & 0x3F));
+    }
+    else
+    {
+        out += static_cast<char>(0xF0 | (codepoint >> 18));
+        out += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
+        out += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
+        out += static_cast<char>(0x80 | (codepoint & 0x3F));
+    }
+}
+
+// Parses the part of a numeric reference that follows "&#": 1 to 8 decimal
+// digits, or 'x'/'X' followed by 1 to 8 hexadecimal digits. Returns false,
+// leaving codepoint untouched, if the text is malformed or does not denote a
+// Unicode scalar value.
+bool parse_numeric_entity(string_view digits, char32_t &codepoint)
+{
+    unsigned base{10};
+    if (!digits.empty() && (digits.front() == 'x' || digits.front() == 'X'))
+    {
+        base = 16;
+        digits.remove_prefix(1);
+    }
+    if (digits.empty() || digits.size() > 8)
+    {
+        return false;
+    }
+
+    // 8 hexadecimal digits are at most 0xFFFFFFFF, so this cannot overflow.
+    unsigned long value{0};
+    for (const char c : digits)
+    {
+        unsigned digit{base}; // Stays out of range if c is not a digit.
+        if (c >= '0' && c <= '9')
+        {
+            digit = static_cast<unsigned>(c - '0');
+        }
+        else if (c >= 'a' && c <= 'f')
+        {
+            digit = static_cast<unsigned>(c - 'a') + 10;
+        }
+        else if (c >= 'A' && c <= 'F')
+        {
+            digit = static_cast<unsigned>(c - 'A') + 10;
+        }
+
+        if (digit >= base)
+        {
+            return false;
+        }
+        value = value * base + digit;
+    }
+
+    // Surrogates and values beyond U+10FFFF have no UTF-8 representation.
+    if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
+    {
+        return false;
+    }
+
+    codepoint = static_cast<char32_t>(value);
+    return true;
+}
+
+// Decodes the text between '&' and ';' of a character reference. Returns
+// false if body is neither a valid numeric reference nor a known name.
+bool decode_entity(const string_view body, char32_t &codepoint)
+{
+    if (body.empty())
+    {
+        return false;
+    }
+    if (body.front() == '#')
+    {
+        return parse_numeric_entity(body.substr(1), codepoint);
+    }
+
+    for (const auto &entry : entity_names)
+    {
+        if (entry.first == body)
+        {
+            codepoint = entry.second;
+            return true;
+        }
+    }
+    return false;
+}
+
+} // namespace
+
+// Replaces named and numeric (decimal / hexadecimal) HTML character
+// references in html with their UTF-8 encoding.
+//
+// The input is scanned exactly once from left to right, so the result of one
+// replacement is never interpreted again: "&amp;lt;" becomes "&lt;", not "<".
+// Anything that is not a well-formed, known reference is copied unchanged and
+// never causes an exception.
+const string unescape_html(string html)
+{
+    const string_view input{html};
+    string output;
+    output.reserve(input.size());
+
+    string_view::size_type pos{0};
+    while (pos < input.size())
+    {
+        const string_view::size_type amp = input.find('&', pos);
+        if (amp == string_view::npos)
+        {
+            // No further references, copy the remainder.
+            output.append(input.data() + pos, input.size() - pos);
+            break;
+        }
+        output.append(input.data() + pos, amp - pos);
+
+        // A reference body is an optional '#' followed by ASCII letters and
+        // digits. Stop at the first character that can not be part of it.
+        string_view::size_type end = amp + 1;
+        if (end < input.size() && input[end] == '#')
+        {
+            ++end;
+        }
+        while (end < input.size() && is_ascii_alnum(input[end]))
+        {
+            ++end;
+        }
+
+        char32_t codepoint{0};
+        if (end < input.size() && input[end] == ';'
+            && decode_entity(input.substr(amp + 1, end - amp - 1), codepoint))
+        {
+            append_utf8(output, codepoint);
+            pos = end + 1;
+        }
+        else
+        {
+            // Not a reference we know, keep the ampersand literally.
+            output += '&';
+            pos = amp + 1;
+        }
+    }
+
+    return output;
+}
+} // namespace mastodonpp