diff --git a/api/config/settings/common.py b/api/config/settings/common.py index d67a15290fb0b6f829665af699578aed96ba99c7..2ed9aa4b649d35131db889513047652d277f1ca5 100644 --- a/api/config/settings/common.py +++ b/api/config/settings/common.py @@ -942,3 +942,5 @@ MODERATION_EMAIL_NOTIFICATIONS_ENABLED = env.bool( # Delay in days after signup before we show the "support us" messages INSTANCE_SUPPORT_MESSAGE_DELAY = env.int("INSTANCE_SUPPORT_MESSAGE_DELAY", default=15) FUNKWHALE_SUPPORT_MESSAGE_DELAY = env.int("FUNKWHALE_SUPPORT_MESSAGE_DELAY", default=15) +# XXX Stable release: remove +USE_FULL_TEXT_SEARCH = env.bool("USE_FULL_TEXT_SEARCH", default=True) diff --git a/api/funkwhale_api/common/fields.py b/api/funkwhale_api/common/fields.py index f206b626eaa5e9f939bafb8adaeb6dec64177eb1..ffc8e5f95d42171542b2cda6e863a9f157026e30 100644 --- a/api/funkwhale_api/common/fields.py +++ b/api/funkwhale_api/common/fields.py @@ -1,5 +1,6 @@ import django_filters from django import forms +from django.conf import settings from django.core.serializers.json import DjangoJSONEncoder from django.db import models @@ -33,12 +34,18 @@ def privacy_level_query(user, lookup_field="privacy_level", user_field="user"): class SearchFilter(django_filters.CharFilter): def __init__(self, *args, **kwargs): self.search_fields = kwargs.pop("search_fields") + self.fts_search_fields = kwargs.pop("fts_search_fields", []) super().__init__(*args, **kwargs) def filter(self, qs, value): if not value: return qs - query = search.get_query(value, self.search_fields) + if settings.USE_FULL_TEXT_SEARCH and self.fts_search_fields: + query = search.get_fts_query( + value, self.fts_search_fields, model=self.parent.Meta.model + ) + else: + query = search.get_query(value, self.search_fields) return qs.filter(query) diff --git a/api/funkwhale_api/common/search.py b/api/funkwhale_api/common/search.py index 4e42fd3462a1ddaa35f27a4c90f4bffd6a74f465..6a4091d7def5a40d23a8cd57d2aaaa7be3cf874f 100644 --- a/api/funkwhale_api/common/search.py +++ b/api/funkwhale_api/common/search.py @@ -1,7 +1,10 @@ import re +from django.contrib.postgres.search import SearchQuery from django.db.models import Q +from . import utils + QUERY_REGEX = re.compile(r'(((?P<key>\w+):)?(?P<value>"[^"]+"|[\S]+))') @@ -56,6 +59,41 @@ def get_query(query_string, search_fields): return query +def get_fts_query(query_string, fts_fields=["body_text"], model=None): + if query_string.startswith('"') and query_string.endswith('"'): + # we pass the query directly to the FTS engine + query_string = query_string[1:-1] + else: + parts = query_string.replace(":", "").split(" ") + parts = ["{}:*".format(p) for p in parts if p] + if not parts: + return Q(pk=None) + + query_string = "&".join(parts) + + if not fts_fields or not query_string.strip(): + return Q(pk=None) + query = None + for field in fts_fields: + if "__" in field and model: + # When we have a nested lookup, we switch to a subquery for enhanced performance + fk_field_name, lookup = ( + field.split("__")[0], + "__".join(field.split("__")[1:]), + ) + fk_field = model._meta.get_field(fk_field_name) + related_model = fk_field.related_model + subquery = related_model.objects.filter( + **{lookup: SearchQuery(query_string, search_type="raw")} + ).values_list("pk", flat=True) + new_query = Q(**{"{}__in".format(fk_field_name): list(subquery)}) + else: + new_query = Q(**{field: SearchQuery(query_string, search_type="raw")}) + query = utils.join_queries_or(query, new_query) + + return query + + def filter_tokens(tokens, valid): return [t for t in tokens if t["key"] in valid] diff --git a/api/funkwhale_api/common/utils.py b/api/funkwhale_api/common/utils.py index 55e4fdfbd8f2ae282ca35e187bb2515da3af6902..4bb18c403a4d536a7d7245b38d21d4fac78a782b 100644 --- a/api/funkwhale_api/common/utils.py +++ b/api/funkwhale_api/common/utils.py @@ -234,3 +234,10 @@ def get_updated_fields(conf, data, obj): final_data[obj_field] = data_value return final_data + + +def join_queries_or(left, right): + if left: + return left | right + else: + return right diff --git a/api/funkwhale_api/federation/activity.py b/api/funkwhale_api/federation/activity.py index 2bac41887c696edec23f795e274e3e78ee7e8e58..b8d04164c120a50b31ced79a4b71116cd705e397 100644 --- a/api/funkwhale_api/federation/activity.py +++ b/api/funkwhale_api/federation/activity.py @@ -511,13 +511,6 @@ def prepare_deliveries_and_inbox_items(recipient_list, type, allowed_domains=Non return inbox_items, deliveries, urls -def join_queries_or(left, right): - if left: - return left | right - else: - return right - - def get_actors_from_audience(urls): """ Given a list of urls such as [ @@ -539,22 +532,24 @@ def get_actors_from_audience(urls): if url == PUBLIC_ADDRESS: continue queries["actors"].append(url) - queries["followed"] = join_queries_or( + queries["followed"] = funkwhale_utils.join_queries_or( queries["followed"], Q(target__followers_url=url) ) final_query = None if queries["actors"]: - final_query = join_queries_or(final_query, Q(fid__in=queries["actors"])) + final_query = funkwhale_utils.join_queries_or( + final_query, Q(fid__in=queries["actors"]) + ) if queries["followed"]: actor_follows = models.Follow.objects.filter(queries["followed"], approved=True) - final_query = join_queries_or( + final_query = funkwhale_utils.join_queries_or( final_query, Q(pk__in=actor_follows.values_list("actor", flat=True)) ) library_follows = models.LibraryFollow.objects.filter( queries["followed"], approved=True ) - final_query = join_queries_or( + final_query = funkwhale_utils.join_queries_or( final_query, Q(pk__in=library_follows.values_list("actor", flat=True)) ) if not final_query: diff --git a/api/funkwhale_api/music/filters.py b/api/funkwhale_api/music/filters.py index 75ce9ec850d6a00dfd64457207835d98d4bb423c..64b44c8c992fae79cef4b2e9de305a882fb0ad5f 100644 --- a/api/funkwhale_api/music/filters.py +++ b/api/funkwhale_api/music/filters.py @@ -23,7 +23,7 @@ TAG_FILTER = common_filters.MultipleQueryFilter(method=filter_tags) class ArtistFilter( audio_filters.IncludeChannelsFilterSet, moderation_filters.HiddenContentFilterSet ): - q = fields.SearchFilter(search_fields=["name"]) + q = fields.SearchFilter(search_fields=["name"], fts_search_fields=["body_text"]) playable = filters.BooleanFilter(field_name="_", method="filter_playable") tag = TAG_FILTER scope = common_filters.ActorScopeFilter( @@ -49,7 +49,10 @@ class ArtistFilter( class TrackFilter( audio_filters.IncludeChannelsFilterSet, moderation_filters.HiddenContentFilterSet ): - q = fields.SearchFilter(search_fields=["title", "album__title", "artist__name"]) + q = fields.SearchFilter( + search_fields=["title", "album__title", "artist__name"], + fts_search_fields=["body_text", "artist__body_text", "album__body_text"], + ) playable = filters.BooleanFilter(field_name="_", method="filter_playable") tag = TAG_FILTER id = common_filters.MultipleQueryFilter(coerce=int) @@ -127,7 +130,10 @@ class AlbumFilter( audio_filters.IncludeChannelsFilterSet, moderation_filters.HiddenContentFilterSet ): playable = filters.BooleanFilter(field_name="_", method="filter_playable") - q = fields.SearchFilter(search_fields=["title", "artist__name"]) + q = fields.SearchFilter( + search_fields=["title", "artist__name"], + fts_search_fields=["body_text", "artist__body_text"], + ) tag = TAG_FILTER scope = common_filters.ActorScopeFilter( actor_field="tracks__uploads__library__actor", distinct=True diff --git a/api/funkwhale_api/music/migrations/0044_full_text_search.py b/api/funkwhale_api/music/migrations/0044_full_text_search.py new file mode 100644 index 0000000000000000000000000000000000000000..e44df90d92f99080939a017c6877ea8579d6dc56 --- /dev/null +++ b/api/funkwhale_api/music/migrations/0044_full_text_search.py @@ -0,0 +1,109 @@ +# Generated by Django 2.2.7 on 2019-12-16 15:06 + +import django.contrib.postgres.search +import django.contrib.postgres.indexes +from django.db import migrations, models +import django.db.models.deletion +from django.db import connection + +FIELDS = { + "music.Artist": { + "fields": [ + 'name', + ], + "trigger_name": "music_artist_update_body_text" + }, + "music.Track": { + "fields": ['title', 'copyright'], + "trigger_name": "music_track_update_body_text" + }, + "music.Album": { + "fields": ['title'], + "trigger_name": "music_album_update_body_text" + }, +} + +def populate_body_text(apps, schema_editor): + for label, search_config in FIELDS.items(): + model = apps.get_model(*label.split('.')) + print('Populating search index for {}…'.format(model.__name__)) + vector = django.contrib.postgres.search.SearchVector(*search_config['fields']) + model.objects.update(body_text=vector) + +def rewind(apps, schema_editor): + pass + +def setup_triggers(apps, schema_editor): + cursor = connection.cursor() + for label, search_config in FIELDS.items(): + model = apps.get_model(*label.split('.')) + table = model._meta.db_table + print('Creating database trigger {} on {}…'.format(search_config['trigger_name'], table)) + sql = """ + CREATE TRIGGER {trigger_name} + BEFORE INSERT OR UPDATE + ON {table} + FOR EACH ROW + EXECUTE PROCEDURE + tsvector_update_trigger(body_text, 'pg_catalog.english', {fields}) + """.format( + trigger_name=search_config['trigger_name'], + table=table, + fields=', '.join(search_config['fields']), + ) + print(sql) + cursor.execute(sql) + +def rewind_triggers(apps, schema_editor): + cursor = connection.cursor() + for label, search_config in FIELDS.items(): + model = apps.get_model(*label.split('.')) + table = model._meta.db_table + print('Dropping database trigger {} on {}…'.format(search_config['trigger_name'], table)) + sql = """ + DROP TRIGGER IF EXISTS {trigger_name} ON {table} + """.format( + trigger_name=search_config['trigger_name'], + table=table, + ) + + cursor.execute(sql) + +class Migration(migrations.Migration): + + dependencies = [ + ('music', '0043_album_cover_attachment'), + ] + + operations = [ + migrations.AddField( + model_name='album', + name='body_text', + field=django.contrib.postgres.search.SearchVectorField(blank=True), + ), + migrations.AddField( + model_name='artist', + name='body_text', + field=django.contrib.postgres.search.SearchVectorField(blank=True), + ), + migrations.AddField( + model_name='track', + name='body_text', + field=django.contrib.postgres.search.SearchVectorField(blank=True), + ), + migrations.AddIndex( + model_name='album', + index=django.contrib.postgres.indexes.GinIndex(fields=['body_text'], name='music_album_body_te_0ec97a_gin'), + ), + migrations.AddIndex( + model_name='artist', + index=django.contrib.postgres.indexes.GinIndex(fields=['body_text'], name='music_artis_body_te_5c408d_gin'), + ), + migrations.AddIndex( + model_name='track', + index=django.contrib.postgres.indexes.GinIndex(fields=['body_text'], name='music_track_body_te_da0a66_gin'), + ), + + migrations.RunPython(setup_triggers, rewind_triggers), + migrations.RunPython(populate_body_text, rewind), + ] diff --git a/api/funkwhale_api/music/models.py b/api/funkwhale_api/music/models.py index f53e8e463e7d198ada7590231b8f77275b55252a..424a307ebbfbc5f2c1ef18031d01e3f2460a22fd 100644 --- a/api/funkwhale_api/music/models.py +++ b/api/funkwhale_api/music/models.py @@ -11,6 +11,8 @@ import pydub from django.conf import settings from django.contrib.contenttypes.fields import GenericRelation from django.contrib.postgres.fields import JSONField +from django.contrib.postgres.search import SearchVectorField +from django.contrib.postgres.indexes import GinIndex from django.core.exceptions import ObjectDoesNotExist from django.core.files.base import ContentFile from django.core.serializers.json import DjangoJSONEncoder @@ -19,7 +21,6 @@ from django.db.models.signals import post_save, pre_save from django.dispatch import receiver from django.urls import reverse from django.utils import timezone - from versatileimagefield.fields import VersatileImageField from funkwhale_api import musicbrainz @@ -56,10 +57,14 @@ class APIModelMixin(models.Model): api_includes = [] creation_date = models.DateTimeField(default=timezone.now, db_index=True) import_hooks = [] + body_text = SearchVectorField(blank=True) class Meta: abstract = True ordering = ["-creation_date"] + indexes = [ + GinIndex(fields=["body_text"]), + ] @classmethod def get_or_create_from_api(cls, mbid): @@ -171,7 +176,12 @@ class ArtistQuerySet(common_models.LocalFromFidQuerySet, models.QuerySet): def with_albums(self): return self.prefetch_related( - models.Prefetch("albums", queryset=Album.objects.with_tracks_count()) + models.Prefetch( + "albums", + queryset=Album.objects.with_tracks_count().select_related( + "attachment_cover", "attributed_to" + ), + ) ) def annotate_playable_by_actor(self, actor): @@ -524,6 +534,9 @@ class Track(APIModelMixin): class Meta: ordering = ["album", "disc_number", "position"] + indexes = [ + GinIndex(fields=["body_text"]), + ] def __str__(self): return self.title diff --git a/api/funkwhale_api/music/utils.py b/api/funkwhale_api/music/utils.py index 09c8cbd122a6099cee8a0e9874fc1fe0ce1eea37..d576ad0672c10706186e0cd727c5091d15c72ddb 100644 --- a/api/funkwhale_api/music/utils.py +++ b/api/funkwhale_api/music/utils.py @@ -4,7 +4,9 @@ import magic import mutagen import pydub -from funkwhale_api.common.search import normalize_query, get_query # noqa +from funkwhale_api.common.search import get_fts_query # noqa +from funkwhale_api.common.search import get_query # noqa +from funkwhale_api.common.search import normalize_query # noqa def guess_mimetype(f): diff --git a/api/funkwhale_api/music/views.py b/api/funkwhale_api/music/views.py index 57d009e99a586bcb3d21c92dbd65c957657720a0..a85607a6941418cef675dd2ac727c5372e646be2 100644 --- a/api/funkwhale_api/music/views.py +++ b/api/funkwhale_api/music/views.py @@ -6,6 +6,7 @@ import urllib.parse from django.conf import settings from django.db import transaction from django.db.models import Count, Prefetch, Sum, F, Q +import django.db.utils from django.utils import timezone from rest_framework import mixins @@ -606,20 +607,30 @@ class Search(views.APIView): anonymous_policy = "setting" def get(self, request, *args, **kwargs): - query = request.GET["query"] - results = { - # 'tags': serializers.TagSerializer(self.get_tags(query), many=True).data, - "artists": serializers.ArtistWithAlbumsSerializer( - self.get_artists(query), many=True - ).data, - "tracks": serializers.TrackSerializer( - self.get_tracks(query), many=True - ).data, - "albums": serializers.AlbumSerializer( - self.get_albums(query), many=True - ).data, - "tags": TagSerializer(self.get_tags(query), many=True).data, - } + query = request.GET.get("query", request.GET.get("q", "")) or "" + query = query.strip() + if not query: + return Response({"detail": "empty query"}, status=400) + try: + results = { + # 'tags': serializers.TagSerializer(self.get_tags(query), many=True).data, + "artists": serializers.ArtistWithAlbumsSerializer( + self.get_artists(query), many=True + ).data, + "tracks": serializers.TrackSerializer( + self.get_tracks(query), many=True + ).data, + "albums": serializers.AlbumSerializer( + self.get_albums(query), many=True + ).data, + "tags": TagSerializer(self.get_tags(query), many=True).data, + } + except django.db.utils.ProgrammingError as e: + if "in tsquery:" in str(e): + return Response({"detail": "Invalid query"}, status=400) + else: + raise + return Response(results, status=200) def get_tracks(self, query): @@ -629,28 +640,58 @@ class Search(views.APIView): "album__title__unaccent", "artist__name__unaccent", ] - query_obj = utils.get_query(query, search_fields) + if settings.USE_FULL_TEXT_SEARCH: + query_obj = utils.get_fts_query( + query, + fts_fields=["body_text", "album__body_text", "artist__body_text"], + model=models.Track, + ) + else: + query_obj = utils.get_query(query, search_fields) qs = ( models.Track.objects.all() .filter(query_obj) - .prefetch_related("artist", "album__artist") + .prefetch_related( + "artist", + "attributed_to", + Prefetch( + "album", + queryset=models.Album.objects.select_related( + "artist", "attachment_cover", "attributed_to" + ), + ), + ) ) return common_utils.order_for_search(qs, "title")[: self.max_results] def get_albums(self, query): search_fields = ["mbid", "title__unaccent", "artist__name__unaccent"] - query_obj = utils.get_query(query, search_fields) + if settings.USE_FULL_TEXT_SEARCH: + query_obj = utils.get_fts_query( + query, fts_fields=["body_text", "artist__body_text"], model=models.Album + ) + else: + query_obj = utils.get_query(query, search_fields) qs = ( models.Album.objects.all() .filter(query_obj) - .prefetch_related("tracks__artist", "artist", "attributed_to") + .select_related("artist", "attachment_cover", "attributed_to") + .prefetch_related("tracks__artist") ) return common_utils.order_for_search(qs, "title")[: self.max_results] def get_artists(self, query): search_fields = ["mbid", "name__unaccent"] - query_obj = utils.get_query(query, search_fields) - qs = models.Artist.objects.all().filter(query_obj).with_albums() + if settings.USE_FULL_TEXT_SEARCH: + query_obj = utils.get_fts_query(query, model=models.Artist) + else: + query_obj = utils.get_query(query, search_fields) + qs = ( + models.Artist.objects.all() + .filter(query_obj) + .with_albums() + .select_related("attributed_to") + ) return common_utils.order_for_search(qs, "name")[: self.max_results] def get_tags(self, query): diff --git a/api/tests/music/test_triggers.py b/api/tests/music/test_triggers.py new file mode 100644 index 0000000000000000000000000000000000000000..e8f5f53a9335b534e8617fa3a34cd5bf3154a5c8 --- /dev/null +++ b/api/tests/music/test_triggers.py @@ -0,0 +1,50 @@ +import pytest + +from django.db import connection + + +@pytest.mark.parametrize( + "factory_name,fields", + [ + ("music.Artist", ["name"]), + ("music.Album", ["title"]), + ("music.Track", ["title"]), + ], +) +def test_body_text_trigger_creation(factory_name, fields, factories): + obj = factories[factory_name]() + obj.refresh_from_db() + cursor = connection.cursor() + sql = """ + SELECT to_tsvector('{indexed_text}') + """.format( + indexed_text=" ".join([getattr(obj, f) for f in fields if getattr(obj, f)]), + ) + cursor.execute(sql) + + assert cursor.fetchone()[0] == obj.body_text + + +@pytest.mark.parametrize( + "factory_name,fields", + [ + ("music.Artist", ["name"]), + ("music.Album", ["title"]), + ("music.Track", ["title"]), + ], +) +def test_body_text_trigger_updaten(factory_name, fields, factories, faker): + obj = factories[factory_name]() + for field in fields: + setattr(obj, field, faker.sentence()) + obj.save() + obj.refresh_from_db() + cursor = connection.cursor() + sql = """ + SELECT to_tsvector('{indexed_text}') + """.format( + indexed_text=" ".join([getattr(obj, f) for f in fields if getattr(obj, f)]), + ) + cursor.execute(sql) + + assert cursor.fetchone()[0] == obj.body_text diff --git a/api/tests/music/test_views.py b/api/tests/music/test_views.py index c56aeee66d23bb75ce5bf3097410b76e5be13df0..7eb71c6d3c1cc9f3716d77e2baa81f0a25785703 100644 --- a/api/tests/music/test_views.py +++ b/api/tests/music/test_views.py @@ -1193,3 +1193,48 @@ def test_get_upload_audio_metadata(logged_in_api_client, factories): assert response.status_code == 200 assert serializer.is_valid(raise_exception=True) is True assert response.data == serializer.validated_data + + +@pytest.mark.parametrize("use_fts", [True, False]) +def test_search_get(use_fts, settings, logged_in_api_client, factories): + settings.USE_FULL_TEXT_SEARCH = use_fts + artist = factories["music.Artist"](name="Foo Fighters") + album = factories["music.Album"](title="Foo Bar") + track = factories["music.Track"](title="Foo Baz") + tag = factories["tags.Tag"](name="Foo") + + factories["music.Track"]() + factories["tags.Tag"]() + + url = reverse("api:v1:search") + expected = { + "artists": [serializers.ArtistWithAlbumsSerializer(artist).data], + "albums": [serializers.AlbumSerializer(album).data], + "tracks": [serializers.TrackSerializer(track).data], + "tags": [views.TagSerializer(tag).data], + } + response = logged_in_api_client.get(url, {"q": "foo"}) + + assert response.status_code == 200 + assert response.data == expected + + +def test_search_get_fts_advanced(settings, logged_in_api_client, factories): + settings.USE_FULL_TEXT_SEARCH = True + artist1 = factories["music.Artist"](name="Foo Bighters") + artist2 = factories["music.Artist"](name="Bar Fighter") + factories["music.Artist"]() + + url = reverse("api:v1:search") + expected = { + "artists": serializers.ArtistWithAlbumsSerializer( + [artist2, artist1], many=True + ).data, + "albums": [], + "tracks": [], + "tags": [], + } + response = logged_in_api_client.get(url, {"q": '"foo | bar"'}) + + assert response.status_code == 200 + assert response.data == expected diff --git a/changes/changelog.d/994.enhancement b/changes/changelog.d/994.enhancement new file mode 100644 index 0000000000000000000000000000000000000000..d4c467c39d5ed6005f24d676896577b87d05597f --- /dev/null +++ b/changes/changelog.d/994.enhancement @@ -0,0 +1 @@ +Replaced our slow research logic by PostgreSQL full-text search (#994) diff --git a/changes/notes.rst b/changes/notes.rst index 7a84fce9ac88328ea817773f780d615ff951411d..1323a70eea69cbc57a5e66d3c14e07b15bf443f7 100644 --- a/changes/notes.rst +++ b/changes/notes.rst @@ -6,6 +6,20 @@ Next release notes Those release notes refer to the current development branch and are reset after each release. +Improved search performance +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Our search engine went through a full rewrite to make it faster. This new engine is enabled +by default when using the search bar, or when searching for artists, albums and tracks. It leverages +PostgreSQL full-text search capabilities. + +During our tests, we observed huge performance improvements after the switch, by an order of +magnitude. This should be especially perceptible on pods with large databases, more modest hardware +or hard drives. + +We plan to remove the old engine in an upcoming release. In the meantime, if anything goes wrong, +you can switch back by setting ``USE_FULL_TEXT_SEARCH=false`` in your ``.env`` file. + User management through the server CLI ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^