From 76e0856793d7723578899aac4692433128006abc Mon Sep 17 00:00:00 2001
From: Eliot Berriot <contact@eliotberriot.com>
Date: Sat, 25 May 2019 15:53:23 +0200
Subject: [PATCH] Added URL extraction from ActivityPub profiles

---
 retribute_api/search/activitypub.py | 41 +++++++++++++++++++++++++++++
 setup.cfg                           |  2 ++
 tests/search/test_activitypub.py    | 41 +++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)
 create mode 100644 retribute_api/search/activitypub.py
 create mode 100644 tests/search/test_activitypub.py

diff --git a/retribute_api/search/activitypub.py b/retribute_api/search/activitypub.py
new file mode 100644
index 0000000..552541f
--- /dev/null
+++ b/retribute_api/search/activitypub.py
@@ -0,0 +1,41 @@
+import lxml.html
+from rest_framework import serializers
+
+
+def get_urls(content):
+    """Return the ``href`` of every ``<a>`` element in the HTML *content*, in order."""
+    # lxml raises ParserError ("Document is empty") on blank input: treat as no links.
+    if not content or not content.strip():
+        return []
+    return list(lxml.html.fromstring(content).xpath("//a/@href"))
+
+
+def extract_urls_from_attachments(attachments):
+    """Return a summary/url row for each PropertyValue attachment holding a link."""
+    data = []
+    for attachment in attachments:
+        # Remote profiles are untrusted: skip entries lacking "type" or "value".
+        if attachment.get("type") != "PropertyValue" or not attachment.get("value"):
+            continue
+        urls = get_urls(attachment["value"])
+        if not urls:
+            continue
+        # Keep only the first link; a missing "name" yields an empty summary.
+        data.append({"summary": attachment.get("name", ""), "url": urls[0]})
+    return data
+
+
+class TagSerializer(serializers.Serializer):
+    name = serializers.CharField()  # tag label, e.g. "#nobot"
+    type = serializers.CharField()  # tag kind, e.g. "Hashtag"
+
+
+class AttachmentSerializer(serializers.Serializer):
+    type = serializers.CharField()  # attachment kind, e.g. "PropertyValue"
+    name = serializers.CharField()  # field label, e.g. "patreon"
+    value = serializers.CharField()  # field content; an HTML link in the tests
+
+
+class ActorSerializer(serializers.Serializer):  # needs >=1 attachment, tag may be []
+    attachment = serializers.ListField(child=AttachmentSerializer(), min_length=1)
+    tag = serializers.ListField(child=TagSerializer(), min_length=0)
diff --git a/setup.cfg b/setup.cfg
index ec78d9b..5902555 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,8 @@ install_requires =
     django-redis
     djangorestframework
     psycopg2-binary
+    lxml
+
 
 
 [options.entry_points]
diff --git a/tests/search/test_activitypub.py b/tests/search/test_activitypub.py
new file mode 100644
index 0000000..e3b6c6b
--- /dev/null
+++ b/tests/search/test_activitypub.py
@@ -0,0 +1,41 @@
+from retribute_api.search import activitypub
+
+
+def test_profile_serializer():
+    """A well-formed actor payload validates and comes back unchanged."""
+    link = '<a href="https://patreon.com/username" rel="me nofollow noopener">Test</a>'
+    actor = {
+        "tag": [{"type": "Hashtag", "name": "#nobot"}],
+        "attachment": [{"type": "PropertyValue", "name": "patreon", "value": link}],
+    }
+
+    result = activitypub.ActorSerializer(data=actor)
+
+    # raise_exception=True turns a validation failure into an error with details
+    # instead of a bare False, which makes a failing run easier to read.
+    assert result.is_valid(raise_exception=True) is True
+    # Validation must not add, drop or rewrite any of the submitted values.
+    assert result.validated_data == actor
+
+
+def test_extract_urls_from_attachments():
+    """Fields containing a link yield a row; link-less fields are dropped."""
+    patreon = '<a href="https://patreon.com/username" rel="me nofollow noopener" target="_blank">This is my Patreon</a>'
+    kofi = '<a href="https://ko-fi.com/username" rel="me nofollow noopener" target="_blank">This is my Ko-Fi</a>'
+    fields = [
+        ("Support me on Patreon", patreon),
+        ("Support me on Ko-Fi", kofi),
+        ("Irrelevant text", "No link"),
+    ]
+    attachments = [
+        {"type": "PropertyValue", "name": label, "value": html}
+        for label, html in fields
+    ]
+
+    rows = activitypub.extract_urls_from_attachments(attachments)
+
+    # The third field has no <a> element, so only two rows come back.
+    assert rows == [
+        {"summary": "Support me on Patreon", "url": "https://patreon.com/username"},
+        {"summary": "Support me on Ko-Fi", "url": "https://ko-fi.com/username"},
+    ]
-- 
GitLab