mirror of
https://git.jami.net/savoirfairelinux/jami-client-qt.git
synced 2025-08-03 22:35:45 +02:00
messageparser: replace regexes with tidy API
Depend on the tidy API for attribute extraction rather than regexes.
1. htmlparser methods return nodes instead of pre-parsed strings.
2. htmlparser provides some methods to extract text/attributes from nodes.
Gitlab: #1248
Change-Id: I367d703680938fb0b7c5055ac41e079c1322da30
This commit is contained in:
parent
ec0feef74d
commit
61126cfa64
5 changed files with 150 additions and 78 deletions
|
@ -39,6 +39,7 @@ public:
|
|||
doc_ = tidyCreate();
|
||||
tidyOptSetBool(doc_, TidyQuiet, yes);
|
||||
tidyOptSetBool(doc_, TidyShowWarnings, no);
|
||||
tidyOptSetInt(doc_, TidyUseCustomTags, TidyCustomEmpty);
|
||||
}
|
||||
|
||||
~HtmlParser()
|
||||
|
@ -51,46 +52,88 @@ public:
|
|||
return tidyParseString(doc_, html.toLocal8Bit().data()) >= 0;
|
||||
}
|
||||
|
||||
using TagInfoList = QMap<TidyTagId, QList<QString>>;
|
||||
using TagNodeList = QMap<TidyTagId, QList<TidyNode>>;
|
||||
|
||||
// A function that traverses the DOM tree and fills a QVariantMap with a list
|
||||
// of the tags and their values. The result is structured as follows:
|
||||
// {tagId1: ["tagValue1", "tagValue2", ...],
|
||||
// tagId: ["tagValue1", "tagValue2", ...],
|
||||
// of the tags and their nodes. The result is structured as follows:
|
||||
// {tagId1: [tagNode1, tagNode2, ...],
|
||||
// tagId2: [tagNode3, tagNode4, ...],
|
||||
// ... }
|
||||
TagInfoList getTags(QList<TidyTagId> tags, int maxDepth = -1)
|
||||
TagNodeList getTagsNodes(const QList<TidyTagId>& tags, int maxDepth = -1)
|
||||
{
|
||||
TagInfoList result;
|
||||
TagNodeList result;
|
||||
traverseNode(
|
||||
tidyGetRoot(doc_),
|
||||
tags,
|
||||
[&result](const QString& value, TidyTagId tag) { result[tag].append(value); },
|
||||
[&result](TidyNode node, TidyTagId tag) { result[tag].append(node); },
|
||||
maxDepth);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
QString getFirstTagValue(TidyTagId tag, int maxDepth = -1)
|
||||
// The same as the above function, only it returns the first node for a single tag.
|
||||
TidyNode getFirstTagNode(TidyTagId tag, int maxDepth = -1)
|
||||
{
|
||||
QString result;
|
||||
TidyNode result = nullptr;
|
||||
traverseNode(
|
||||
tidyGetRoot(doc_),
|
||||
{tag},
|
||||
[&result](const QString& value, TidyTagId) { result = value; },
|
||||
[&result](TidyNode node, TidyTagId) { result = node; },
|
||||
maxDepth);
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
void traverseNode(TidyNode node,
|
||||
QList<TidyTagId> tags,
|
||||
const std::function<void(const QString&, TidyTagId)>& cb,
|
||||
int depth = -1)
|
||||
// Extract the text value from a node.
|
||||
QString getNodeText(TidyNode node)
|
||||
{
|
||||
TidyBuffer nodeValue = {};
|
||||
if (!node || tidyNodeGetText(doc_, node, &nodeValue) != yes) {
|
||||
return QString();
|
||||
}
|
||||
QString result = QString::fromUtf8((char*) nodeValue.bp, nodeValue.size);
|
||||
tidyBufFree(&nodeValue);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Extract the attribute value from a node.
|
||||
QString getNodeAttr(TidyNode node, TidyAttrId attrId)
|
||||
{
|
||||
TidyAttr attr = tidyAttrGetById(node, attrId);
|
||||
if (!attr) {
|
||||
return QString();
|
||||
}
|
||||
const auto* attrValue = tidyAttrValue(attr);
|
||||
if (!attrValue) {
|
||||
return QString();
|
||||
}
|
||||
return QString::fromLocal8Bit(attrValue);
|
||||
}
|
||||
|
||||
// Extract the inner HTML of a node.
|
||||
QString getNodeInnerHtml(TidyNode node)
|
||||
{
|
||||
if (!node) {
|
||||
return QString();
|
||||
}
|
||||
const auto* child = tidyGetChild(node);
|
||||
return child ? getNodeText(child) : QString();
|
||||
}
|
||||
|
||||
// Convenience wrapper: look up a node for `tag` with getFirstTagNode and
// return its inner HTML (see getNodeInnerHtml).
QString getTagInnerHtml(TidyTagId tag)
{
    const TidyNode tagNode = getFirstTagNode(tag);
    return getNodeInnerHtml(tagNode);
}
|
||||
|
||||
private:
|
||||
// NOLINTNEXTLINE(misc-no-recursion)
|
||||
void traverseNode(TidyNode node,
|
||||
const QList<TidyTagId>& tags,
|
||||
const std::function<void(TidyNode, TidyTagId)>& cb,
|
||||
int depth = -1)
|
||||
{
|
||||
for (auto tag : tags) {
|
||||
if (tidyNodeGetId(node) == tag && tidyNodeGetText(doc_, node, &nodeValue) == yes && cb) {
|
||||
cb(QString::fromLocal8Bit(nodeValue.bp), tag);
|
||||
if (tidyNodeGetId(node) == tag && cb) {
|
||||
cb(node, tag);
|
||||
if (depth != -1 && --depth == 0) {
|
||||
return;
|
||||
}
|
||||
|
|
|
@ -25,6 +25,18 @@
|
|||
|
||||
#include "md4c-html.h"
|
||||
|
||||
namespace {
|
||||
// A callback function that will be called by the md4c library (`md_html`) to output the HTML.
|
||||
void
|
||||
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
|
||||
{
|
||||
QByteArray* array = static_cast<QByteArray*>(userData);
|
||||
if (data_size > 0) {
|
||||
array->append(data, int(data_size));
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
MessageParser::MessageParser(PreviewEngine* previewEngine, QObject* parent)
|
||||
: QObject(parent)
|
||||
, previewEngine_(previewEngine)
|
||||
|
@ -51,9 +63,9 @@ MessageParser::parseMessage(const QString& messageId,
|
|||
// Now that we have the HTML, we can parse it to get a list of tags and their values.
|
||||
// We are only interested in the <a>, <del> and <pre> tags.
|
||||
htmlParser_->parseHtmlString(html);
|
||||
auto tagsMap = htmlParser_->getTags({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
|
||||
auto tagsMap = htmlParser_->getTagsNodes({TidyTag_A, TidyTag_DEL, TidyTag_PRE});
|
||||
|
||||
static QString styleTag("<style>%1</style>");
|
||||
static const QString styleTag("<style>%1</style>");
|
||||
QString style;
|
||||
|
||||
// Check for any <pre> tags. If there are any, we need to:
|
||||
|
@ -89,11 +101,9 @@ MessageParser::parseMessage(const QString& messageId,
|
|||
// If the user has enabled link previews, then we need to generate the link preview.
|
||||
if (previewLinks) {
|
||||
// Get the first link in the message.
|
||||
auto anchorTag = tagsMap[TidyTag_A].first();
|
||||
static QRegularExpression hrefRegex("href=\"(.*?)\"");
|
||||
auto match = hrefRegex.match(anchorTag);
|
||||
if (match.hasMatch()) {
|
||||
Q_EMIT previewEngine_->parseLink(messageId, match.captured(1));
|
||||
auto href = htmlParser_->getNodeAttr(tagsMap[TidyTag_A].first(), TidyAttr_HREF);
|
||||
if (!href.isEmpty()) {
|
||||
Q_EMIT previewEngine_->parseLink(messageId, href);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,13 +120,13 @@ void
|
|||
MessageParser::preprocessMarkdown(QString& markdown)
|
||||
{
|
||||
// Match all instances of the linefeed character.
|
||||
static QRegularExpression newlineRegex("\n");
|
||||
static const QRegularExpression newlineRegex("\\r?\\n");
|
||||
static const QString newline = " \n";
|
||||
|
||||
// Replace all instances of the linefeed character with 2 spaces + a linefeed character
|
||||
// in order to force a line break in the HTML.
|
||||
// Note: we should only do this for non-code fenced blocks.
|
||||
static QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
|
||||
static const QRegularExpression codeFenceRe("`{1,3}([\\s\\S]*?)`{1,3}");
|
||||
auto match = codeFenceRe.globalMatch(markdown);
|
||||
|
||||
// If there are no code blocks, then we can just replace all linefeeds with 2 spaces
|
||||
|
@ -132,7 +142,7 @@ MessageParser::preprocessMarkdown(QString& markdown)
|
|||
enum BlockType { Text, Code };
|
||||
QVector<QPair<BlockType, QString>> codeBlocks;
|
||||
|
||||
int start = 0;
|
||||
qsizetype start = 0;
|
||||
while (match.hasNext()) {
|
||||
auto m = match.next();
|
||||
auto nonCodelength = m.capturedStart() - start;
|
||||
|
@ -158,27 +168,16 @@ MessageParser::preprocessMarkdown(QString& markdown)
|
|||
}
|
||||
}
|
||||
|
||||
// A callback function that will be called by the md4c library (`md_html`) to output the HTML.
|
||||
static void
|
||||
htmlChunkCb(const MD_CHAR* data, MD_SIZE data_size, void* userData)
|
||||
{
|
||||
QByteArray* array = static_cast<QByteArray*>(userData);
|
||||
if (data_size > 0) {
|
||||
array->append(data, int(data_size));
|
||||
}
|
||||
};
|
||||
|
||||
QString
|
||||
MessageParser::markdownToHtml(const char* markdown)
|
||||
{
|
||||
static auto md_flags = MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_NOINDENTEDCODEBLOCKS
|
||||
| MD_FLAG_TASKLISTS | MD_FLAG_STRIKETHROUGH | MD_FLAG_UNDERLINE;
|
||||
size_t data_len = strlen(markdown);
|
||||
const size_t data_len = strlen(markdown);
|
||||
if (data_len <= 0) {
|
||||
return QString();
|
||||
} else {
|
||||
QByteArray array;
|
||||
int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
|
||||
return result == 0 ? QString::fromUtf8(array) : QString();
|
||||
}
|
||||
QByteArray array;
|
||||
const int result = md_html(markdown, MD_SIZE(data_len), &htmlChunkCb, &array, md_flags, 0);
|
||||
return result == 0 ? QString::fromUtf8(array) : QString();
|
||||
}
|
||||
|
|
|
@ -19,15 +19,6 @@
|
|||
|
||||
#include <QRegularExpression>
|
||||
|
||||
static QString
|
||||
getInnerHtml(const QString& tag)
|
||||
{
|
||||
static const QRegularExpression re(">([^<]+)<");
|
||||
const auto match = re.match(tag);
|
||||
return match.hasMatch() ? match.captured(1) : QString {};
|
||||
};
|
||||
|
||||
// Portable newline regex.
|
||||
const QRegularExpression PreviewEngine::newlineRe("\\r?\\n");
|
||||
|
||||
PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
|
||||
|
@ -39,12 +30,11 @@ PreviewEngine::PreviewEngine(ConnectivityMonitor* cm, QObject* parent)
|
|||
}
|
||||
|
||||
QString
|
||||
PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
|
||||
PreviewEngine::getTagContent(const QList<QString>& tags, const QString& value)
|
||||
{
|
||||
Q_FOREACH (auto tag, tags) {
|
||||
const QRegularExpression re("(property|name)=\"(og:|twitter:|)" + value
|
||||
+ "\".*?content=\"([^\"]+)\"");
|
||||
|
||||
const auto match = re.match(tag.remove(newlineRe));
|
||||
if (match.hasMatch()) {
|
||||
return match.captured(3);
|
||||
|
@ -54,45 +44,44 @@ PreviewEngine::getTagContent(QList<QString>& tags, const QString& value)
|
|||
}
|
||||
|
||||
QString
|
||||
PreviewEngine::getTitle(HtmlParser::TagInfoList& metaTags)
|
||||
PreviewEngine::getTitle(const QList<QString>& metaTags)
|
||||
{
|
||||
// Try with opengraph/twitter props
|
||||
QString title = getTagContent(metaTags[TidyTag_META], "title");
|
||||
QString title = getTagContent(metaTags, "title");
|
||||
if (title.isEmpty()) { // Try with title tag
|
||||
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_TITLE));
|
||||
title = htmlParser_->getTagInnerHtml(TidyTag_TITLE);
|
||||
}
|
||||
if (title.isEmpty()) { // Try with h1 tag
|
||||
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H1));
|
||||
title = htmlParser_->getTagInnerHtml(TidyTag_H1);
|
||||
}
|
||||
if (title.isEmpty()) { // Try with h2 tag
|
||||
title = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_H2));
|
||||
title = htmlParser_->getTagInnerHtml(TidyTag_H2);
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
QString
|
||||
PreviewEngine::getDescription(HtmlParser::TagInfoList& metaTags)
|
||||
PreviewEngine::getDescription(const QList<QString>& metaTags)
|
||||
{
|
||||
// Try with og/twitter props
|
||||
QString d = getTagContent(metaTags[TidyTag_META], "description");
|
||||
if (d.isEmpty()) { // Try with first paragraph
|
||||
d = getInnerHtml(htmlParser_->getFirstTagValue(TidyTag_P));
|
||||
QString desc = getTagContent(metaTags, "description");
|
||||
if (desc.isEmpty()) { // Try with first paragraph
|
||||
desc = htmlParser_->getTagInnerHtml(TidyTag_P);
|
||||
}
|
||||
return d;
|
||||
return desc;
|
||||
}
|
||||
|
||||
QString
|
||||
PreviewEngine::getImage(HtmlParser::TagInfoList& metaTags)
|
||||
PreviewEngine::getImage(const QList<QString>& metaTags)
|
||||
{
|
||||
// Try with og/twitter props
|
||||
QString image = getTagContent(metaTags[TidyTag_META], "image");
|
||||
QString image = getTagContent(metaTags, "image");
|
||||
if (image.isEmpty()) { // Try with href of link tag (rel="image_src")
|
||||
auto tags = htmlParser_->getTags({TidyTag_LINK});
|
||||
Q_FOREACH (auto tag, tags[TidyTag_LINK]) {
|
||||
static const QRegularExpression re("rel=\"image_src\".*?href=\"([^\"]+)\"");
|
||||
const auto match = re.match(tag.remove(newlineRe));
|
||||
if (match.hasMatch()) {
|
||||
return match.captured(1);
|
||||
auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_LINK});
|
||||
Q_FOREACH (auto tag, tagsNodes[TidyTag_LINK]) {
|
||||
QString href = htmlParser_->getNodeAttr(tag, TidyAttr_HREF);
|
||||
if (!href.isEmpty()) {
|
||||
return href;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -104,7 +93,12 @@ PreviewEngine::onParseLink(const QString& messageId, const QString& link)
|
|||
{
|
||||
sendGetRequest(QUrl(link), [this, messageId, link](const QByteArray& html) {
|
||||
htmlParser_->parseHtmlString(html);
|
||||
auto metaTags = htmlParser_->getTags({TidyTag_META});
|
||||
auto tagsNodes = htmlParser_->getTagsNodes({TidyTag_META});
|
||||
auto metaTagNodes = tagsNodes[TidyTag_META];
|
||||
QList<QString> metaTags;
|
||||
Q_FOREACH (auto tag, metaTagNodes) {
|
||||
metaTags.append(htmlParser_->getNodeText(tag));
|
||||
}
|
||||
QString domain = QUrl(link).host();
|
||||
if (domain.isEmpty()) {
|
||||
domain = link;
|
||||
|
|
|
@ -39,10 +39,10 @@ private:
|
|||
// An instance of HtmlParser used to parse HTML.
|
||||
HtmlParser* htmlParser_;
|
||||
|
||||
QString getTagContent(QList<QString>& tags, const QString& value);
|
||||
QString getTitle(HtmlParser::TagInfoList& metaTags);
|
||||
QString getDescription(HtmlParser::TagInfoList& metaTags);
|
||||
QString getImage(HtmlParser::TagInfoList& metaTags);
|
||||
QString getTagContent(const QList<QString>& tags, const QString& value);
|
||||
QString getTitle(const QList<QString>& metaTags);
|
||||
QString getDescription(const QList<QString>& metaTags);
|
||||
QString getImage(const QList<QString>& metaTags);
|
||||
|
||||
static const QRegularExpression newlineRe;
|
||||
};
|
||||
|
|
|
@ -117,7 +117,6 @@ TEST_F(MessageParserFixture, EndOfLineCharactersAreParsedCorrectly)
|
|||
auto backgroundColor = QColor::fromRgb(0, 0, 255);
|
||||
|
||||
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
|
||||
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
|
||||
|
||||
// Parse a message with a link.
|
||||
globalEnv.messageParser->parseMessage("msgId_03",
|
||||
|
@ -148,7 +147,6 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
|
|||
auto backgroundColor = QColor::fromRgb(0, 0, 255);
|
||||
|
||||
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
|
||||
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
|
||||
|
||||
// Parse a message with a link.
|
||||
globalEnv.messageParser->parseMessage("msgId_04",
|
||||
|
@ -169,3 +167,41 @@ TEST_F(MessageParserFixture, FencedCodeIsParsedCorrectly)
|
|||
"<style>pre,code{background-color:#0000ff;color:#ffffff;white-space:pre-wrap;"
|
||||
"}</style><p>Text with</p>\n<pre><code>code\n</code></pre>\n");
|
||||
}
|
||||
|
||||
/*!
|
||||
* WHEN We parse a text body with a youtube link.
|
||||
* THEN PreviewEngine::parseLink should be called with the correct arguments.
|
||||
*/
|
||||
TEST_F(MessageParserFixture, YoutubeLinkIsParsedCorrectly)
|
||||
{
|
||||
auto url = "https://www.youtube.com/watch?v=1234567890";
|
||||
auto msg = "blah blah " + QString(url) + " blah blah";
|
||||
|
||||
QSignalSpy messageParsedSpy(globalEnv.messageParser.data(), &MessageParser::messageParsed);
|
||||
QSignalSpy linkInfoReadySpy(globalEnv.messageParser.data(), &MessageParser::linkInfoReady);
|
||||
|
||||
// Parse a message with a link.
|
||||
globalEnv.messageParser->parseMessage("msgId_05",
|
||||
msg,
|
||||
true,
|
||||
QColor::fromRgb(0, 0, 255),
|
||||
QColor::fromRgb(0, 0, 255));
|
||||
|
||||
// Wait for the messageParsed signal which should be emitted once.
|
||||
messageParsedSpy.wait();
|
||||
EXPECT_EQ(messageParsedSpy.count(), 1);
|
||||
|
||||
QList<QVariant> messageParserArguments = messageParsedSpy.takeFirst();
|
||||
EXPECT_TRUE(messageParserArguments.at(0).typeId() == qMetaTypeId<QString>());
|
||||
|
||||
// Wait for the linkInfoReady signal which should be emitted once.
|
||||
linkInfoReadySpy.wait();
|
||||
EXPECT_EQ(linkInfoReadySpy.count(), 1);
|
||||
|
||||
QList<QVariant> linkInfoReadyArguments = linkInfoReadySpy.takeFirst();
|
||||
EXPECT_TRUE(linkInfoReadyArguments.at(0).typeId() == qMetaTypeId<QString>());
|
||||
EXPECT_EQ(linkInfoReadyArguments.at(0).toString(), "msgId_05");
|
||||
EXPECT_TRUE(linkInfoReadyArguments.at(1).typeId() == qMetaTypeId<QVariantMap>());
|
||||
QVariantMap linkInfo = linkInfoReadyArguments.at(1).toMap();
|
||||
EXPECT_EQ(linkInfo["url"].toString(), url);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue