From 76e0856793d7723578899aac4692433128006abc Mon Sep 17 00:00:00 2001 From: Eliot Berriot <contact@eliotberriot.com> Date: Sat, 25 May 2019 15:53:23 +0200 Subject: [PATCH] Added url extraction from activitypub profiles --- retribute_api/search/activitypub.py | 41 +++++++++++++++++++++++++++++ setup.cfg | 2 ++ tests/search/test_activitypub.py | 41 +++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 retribute_api/search/activitypub.py create mode 100644 tests/search/test_activitypub.py diff --git a/retribute_api/search/activitypub.py b/retribute_api/search/activitypub.py new file mode 100644 index 0000000..552541f --- /dev/null +++ b/retribute_api/search/activitypub.py @@ -0,0 +1,41 @@ +import lxml.html +from rest_framework import serializers + + +def get_urls(content): + links = [] + dom = lxml.html.fromstring(content) + for link in dom.xpath("//a/@href"): + links.append(link) + return links + + +def extract_urls_from_attachments(attachments): + data = [] + for attachment in attachments: + if attachment["type"] != "PropertyValue": + continue + + urls = get_urls(attachment["value"]) + if not urls: + continue + row = {"summary": attachment["name"], "url": urls[0]} + data.append(row) + + return data + + +class TagSerializer(serializers.Serializer): + name = serializers.CharField() + type = serializers.CharField() + + +class AttachmentSerializer(serializers.Serializer): + type = serializers.CharField() + name = serializers.CharField() + value = serializers.CharField() + + +class ActorSerializer(serializers.Serializer): + attachment = serializers.ListField(child=AttachmentSerializer(), min_length=1) + tag = serializers.ListField(child=TagSerializer(), min_length=0) diff --git a/setup.cfg b/setup.cfg index ec78d9b..5902555 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,8 @@ install_requires = django-redis djangorestframework psycopg2-binary + lxml + [options.entry_points] diff --git a/tests/search/test_activitypub.py b/tests/search/test_activitypub.py new file mode 100644 index 0000000..e3b6c6b --- /dev/null +++ b/tests/search/test_activitypub.py @@ -0,0 +1,41 @@ +from retribute_api.search import activitypub + + +def test_profile_serializer(): + payload = { + "tag": [{"type": "Hashtag", "name": "#nobot"}], + "attachment": [ + { + "type": "PropertyValue", + "name": "patreon", + "value": '<a href="https://patreon.com/username" rel="me nofollow noopener">Test</a>', + } + ], + } + serializer = activitypub.ActorSerializer(data=payload) + assert serializer.is_valid(raise_exception=True) is True + + assert serializer.validated_data == payload + + +def test_extract_urls_from_attachments(): + attachments = [ + { + "type": "PropertyValue", + "name": "Support me on Patreon", + "value": '<a href="https://patreon.com/username" rel="me nofollow noopener" target="_blank">This is my Patreon</a>', + }, + { + "type": "PropertyValue", + "name": "Support me on Ko-Fi", + "value": '<a href="https://ko-fi.com/username" rel="me nofollow noopener" target="_blank">This is my Ko-Fi</a>', + }, + {"type": "PropertyValue", "name": "Irrelevant text", "value": "No link"}, + ] + + expected = [ + {"summary": "Support me on Patreon", "url": "https://patreon.com/username"}, + {"summary": "Support me on Ko-Fi", "url": "https://ko-fi.com/username"}, + ] + + assert activitypub.extract_urls_from_attachments(attachments) == expected -- GitLab