Unverified Commit 752c993e authored by Agate's avatar Agate 💬

Importer updates: watch directories, handle metadata updates

parent 2b5a2b39
......@@ -1309,3 +1309,6 @@ IGNORE_FORWARDED_HOST_AND_PROTO = env.bool(
"""
Use :attr:`FUNKWHALE_HOSTNAME` and :attr:`FUNKWHALE_PROTOCOL` instead of request header.
"""
# Default hashlib algorithm used by common.utils.get_file_hash to checksum
# audio files (checksums are stored as "<algo>:<hexdigest>").
HASHING_ALGORITHM = "sha256"
# Number of bytes read per iteration when hashing a file (100 KiB).
HASHING_CHUNK_SIZE = 1024 * 100
import datetime
import hashlib
from django.core.files.base import ContentFile
from django.http import request
......@@ -458,3 +459,19 @@ def monkey_patch_request_build_absolute_uri():
request.HttpRequest.scheme = property(scheme)
request.HttpRequest.get_host = get_host
def get_file_hash(file, algo=None, chunk_size=None, full_read=False):
    """
    Compute a prefixed checksum for a binary file-like object, e.g
    ``sha256:2cf24dba…``.

    :param file: seekable binary file-like object; it is rewound to offset 0
    :param algo: hashlib algorithm name, defaults to ``settings.HASHING_ALGORITHM``
    :param chunk_size: read size in bytes, defaults to ``settings.HASHING_CHUNK_SIZE``
    :param full_read: when True, hash the whole file in chunks; otherwise only
        the first ``chunk_size`` bytes are hashed
    :return: ``"<algo>:<hexdigest>"``
    """
    algo = algo or settings.HASHING_ALGORITHM
    chunk_size = chunk_size or settings.HASHING_CHUNK_SIZE
    # hashlib.new() resolves any algorithm hashlib supports; also avoids
    # shadowing the hash() builtin with the accumulator name
    hasher = hashlib.new(algo)
    file.seek(0)
    if full_read:
        # iter() with a b"" sentinel stops cleanly at EOF
        for byte_block in iter(lambda: file.read(chunk_size), b""):
            hasher.update(byte_block)
    else:
        # sometimes, it's useful to only hash the beginning of the file, e.g
        # to avoid a lot of I/O when crawling large libraries
        hasher.update(file.read(chunk_size))
    return "{}:{}".format(algo, hasher.hexdigest())
......@@ -2,6 +2,7 @@ from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import Q
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music import models, utils
......@@ -17,9 +18,9 @@ class Command(BaseCommand):
help="Do not execute anything",
)
# NOTE(review): the diff left both the old (--mimetypes) and new (--mimetype)
# spellings of this flag in place; only the new one is kept here.
parser.add_argument(
    "--mimetype",
    action="store_true",
    dest="mimetype",
    default=True,
    help="Check and fix mimetypes",
)
......@@ -37,16 +38,33 @@ class Command(BaseCommand):
default=False,
help="Check and fix file size, can be really slow because it needs to access files",
)
parser.add_argument(
    "--checksum",
    action="store_true",
    dest="checksum",
    default=False,
    # help text previously copy-pasted from the --size flag
    help="Check and fix checksums, can be really slow because it needs to access files",
)
parser.add_argument(
"--batch-size",
"-s",
dest="batch_size",
default=1000,
type=int,
help="Size of each updated batch",
)
def handle(self, *args, **options):
    """
    Run each enabled fixer in order; with --dry-run, fixers only report
    what they would change.
    """
    if options["dry_run"]:
        self.stdout.write("Dry-run on, will not commit anything")
    # NOTE(review): a stale duplicate check on the old "mimetypes" option
    # key was removed; the parser now stores the flag under "mimetype"
    if options["mimetype"]:
        self.fix_mimetypes(**options)

    if options["data"]:
        self.fix_file_data(**options)

    if options["size"]:
        self.fix_file_size(**options)

    if options["checksum"]:
        self.fix_file_checksum(**options)
@transaction.atomic
def fix_mimetypes(self, dry_run, **kwargs):
......@@ -54,11 +72,12 @@ class Command(BaseCommand):
matching = models.Upload.objects.filter(
Q(source__startswith="file://") | Q(source__startswith="upload://")
).exclude(mimetype__startswith="audio/")
total = matching.count()
self.stdout.write(
"[mimetypes] {} entries found with bad or no mimetype".format(
matching.count()
)
"[mimetypes] {} entries found with bad or no mimetype".format(total)
)
if not total:
return
for extension, mimetype in utils.EXTENSION_TO_MIMETYPE.items():
qs = matching.filter(source__endswith=".{}".format(extension))
self.stdout.write(
......@@ -81,24 +100,36 @@ class Command(BaseCommand):
)
if dry_run:
return
for i, upload in enumerate(matching.only("audio_file")):
self.stdout.write(
"[bitrate/length] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
)
try:
audio_file = upload.get_audio_file()
if audio_file:
chunks = common_utils.chunk_queryset(
matching.only("id", "audio_file", "source"), kwargs["batch_size"]
)
handled = 0
for chunk in chunks:
updated = []
for upload in chunk:
handled += 1
self.stdout.write(
"[bitrate/length] {}/{} fixing file #{}".format(
handled, total, upload.pk
)
)
try:
audio_file = upload.get_audio_file()
data = utils.get_audio_file_data(audio_file)
upload.bitrate = data["bitrate"]
upload.duration = data["length"]
upload.save(update_fields=["duration", "bitrate"])
except Exception as e:
self.stderr.write(
"[bitrate/length] error with file #{}: {}".format(
upload.pk, str(e)
)
)
else:
self.stderr.write("[bitrate/length] no file found")
except Exception as e:
self.stderr.write(
"[bitrate/length] error with file #{}: {}".format(upload.pk, str(e))
)
updated.append(upload)
models.Upload.objects.bulk_update(updated, ["bitrate", "duration"])
def fix_file_size(self, dry_run, **kwargs):
self.stdout.write("Fixing missing size...")
......@@ -107,15 +138,64 @@ class Command(BaseCommand):
self.stdout.write("[size] {} entries found with missing values".format(total))
if dry_run:
return
for i, upload in enumerate(matching.only("size")):
self.stdout.write(
"[size] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
)
try:
upload.size = upload.get_file_size()
upload.save(update_fields=["size"])
except Exception as e:
self.stderr.write(
"[size] error with file #{}: {}".format(upload.pk, str(e))
chunks = common_utils.chunk_queryset(
matching.only("id", "audio_file", "source"), kwargs["batch_size"]
)
handled = 0
for chunk in chunks:
updated = []
for upload in chunk:
handled += 1
self.stdout.write(
"[size] {}/{} fixing file #{}".format(handled, total, upload.pk)
)
try:
upload.size = upload.get_file_size()
except Exception as e:
self.stderr.write(
"[size] error with file #{}: {}".format(upload.pk, str(e))
)
else:
updated.append(upload)
models.Upload.objects.bulk_update(updated, ["size"])
def fix_file_checksum(self, dry_run, **kwargs):
    """
    Backfill the checksum field on uploads that have an audio file or an
    in-place file:// source but no stored checksum yet.
    """
    self.stdout.write("Fixing missing checksums...")
    candidates = models.Upload.objects.filter(
        Q(checksum=None)
        & (Q(audio_file__isnull=False) | Q(source__startswith="file://"))
    )
    total = candidates.count()
    self.stdout.write(
        "[checksum] {} entries found with missing values".format(total)
    )
    if dry_run:
        return
    progress = 0
    for batch in common_utils.chunk_queryset(
        candidates.only("id", "audio_file", "source"), kwargs["batch_size"]
    ):
        to_save = []
        for upload in batch:
            progress += 1
            self.stdout.write(
                "[checksum] {}/{} fixing file #{}".format(progress, total, upload.pk)
            )
            try:
                upload.checksum = common_utils.get_file_hash(
                    upload.get_audio_file()
                )
            except Exception as e:
                self.stderr.write(
                    "[checksum] error with file #{}: {}".format(upload.pk, str(e))
                )
            else:
                to_save.append(upload)

        # persist each batch with a single UPDATE instead of one save() per row
        models.Upload.objects.bulk_update(to_save, ["checksum"])
# Generated by Django 3.0.4 on 2020-05-05 08:10
from django.db import migrations, models


class Migration(migrations.Migration):
    """Add Upload.checksum and extend UploadVersion mimetype choices."""

    dependencies = [
        ('music', '0051_auto_20200319_1249'),
    ]

    operations = [
        # nullable + indexed so existing rows migrate cheaply and lookups by
        # checksum (e.g "sha256:<hexdigest>") stay fast
        migrations.AddField(
            model_name='upload',
            name='checksum',
            field=models.CharField(blank=True, db_index=True, max_length=100, null=True),
        ),
        migrations.AlterField(
            model_name='uploadversion',
            name='mimetype',
            field=models.CharField(choices=[('audio/mp3', 'mp3'), ('audio/mpeg3', 'mp3'), ('audio/x-mp3', 'mp3'), ('audio/mpeg', 'mp3'), ('video/ogg', 'ogg'), ('audio/ogg', 'ogg'), ('audio/opus', 'opus'), ('audio/x-m4a', 'aac'), ('audio/x-m4a', 'm4a'), ('audio/x-flac', 'flac'), ('audio/flac', 'flac')], max_length=50),
        ),
    ]
......@@ -655,6 +655,14 @@ class Track(APIModelMixin):
class UploadQuerySet(common_models.NullsLastQuerySet):
def in_place(self, include=True):
    """
    Filter uploads imported in place: a file:// source with no stored
    audio file. With include=False, return the complement instead.
    """
    missing_audio_file = models.Q(audio_file="") | models.Q(audio_file=None)
    query = models.Q(source__startswith="file://") & missing_audio_file
    return self.filter(query if include else ~query)
def playable_by(self, actor, include=True):
libraries = Library.objects.viewable_by(actor)
......@@ -754,6 +762,9 @@ class Upload(models.Model):
)
downloads_count = models.PositiveIntegerField(default=0)
# stores checksums such as `sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
checksum = models.CharField(max_length=100, db_index=True, null=True, blank=True)
objects = UploadQuerySet.as_manager()
@property
......@@ -833,7 +844,7 @@ class Upload(models.Model):
def get_audio_file(self):
    """
    Return an open binary file object for this upload's audio, or None
    when neither a stored audio file nor an in-place source is available.

    NOTE(review): the diff left the unguarded `if self.source.startswith`
    line next to its replacement; only the None-safe version is kept, since
    source is nullable.
    """
    if self.audio_file:
        return self.audio_file.open()
    if self.source and self.source.startswith("file://"):
        return open(self.source.replace("file://", "", 1), "rb")
def get_audio_data(self):
......@@ -866,6 +877,15 @@ class Upload(models.Model):
self.mimetype = mimetypes.guess_type(self.source)[0]
if not self.size and self.audio_file:
self.size = self.audio_file.size
if not self.checksum:
try:
audio_file = self.get_audio_file()
except FileNotFoundError:
pass
else:
if audio_file:
self.checksum = common_utils.get_file_hash(audio_file)
if not self.pk and not self.fid and self.library.actor.get_user():
self.fid = self.get_federation_id()
return super().save(**kwargs)
......
......@@ -851,3 +851,71 @@ def update_library_entity(obj, data):
obj.save(update_fields=list(data.keys()))
return obj
# Describes which fields may be synced per object kind when an imported
# file's metadata changes. An empty per-field dict means "copy the value
# found under the same key"; a "getter" entry overrides how the new value
# is computed from the parsed metadata.
UPDATE_CONFIG = {
    "track": {
        "position": {},
        "title": {},
        "mbid": {},
        "disc_number": {},
        "copyright": {},
        "license": {
            # license matching needs both the license and copyright strings
            "getter": lambda data, field: licenses.match(
                data.get("license"), data.get("copyright")
            )
        },
    },
    "album": {"title": {}, "mbid": {}, "release_date": {}},
    "artist": {"name": {}, "mbid": {}},
    "album_artist": {"name": {}, "mbid": {}},
}
@transaction.atomic
def update_track_metadata(audio_metadata, track):
    """
    Sync a track and its related album/artist objects with fresh metadata
    read from an audio file, e.g when an in-place imported file is updated
    by an outside tool (such as beets). Only fields listed in UPDATE_CONFIG
    are considered, and each object is saved only if something changed.
    """
    serializer = metadata.TrackMetadataSerializer(data=audio_metadata)
    serializer.is_valid(raise_exception=True)
    new_data = serializer.validated_data
    # (UPDATE_CONFIG key, object to update, callable extracting its subtree
    # of the validated metadata)
    to_update = [
        ("track", track, lambda data: data),
        ("album", track.album, lambda data: data["album"]),
        ("artist", track.artist, lambda data: data["artists"][0]),
        (
            "album_artist",
            track.album.artist if track.album else None,
            lambda data: data["album"]["artists"][0],
        ),
    ]
    for id, obj, data_getter in to_update:
        if not obj:
            # e.g track without an album
            continue
        obj_updated_fields = []
        try:
            obj_data = data_getter(new_data)
        except IndexError:
            # no artist entry in the parsed metadata: nothing to sync
            continue
        for field, config in UPDATE_CONFIG[id].items():
            # default getter copies the value stored under the configured
            # (or same-named) key; a custom "getter" may override it
            getter = config.get(
                "getter", lambda data, field: data[config.get("field", field)]
            )
            try:
                new_value = getter(obj_data, field)
            except KeyError:
                # field absent from the new metadata: leave current value
                continue
            old_value = getattr(obj, field)
            if new_value == old_value:
                continue
            obj_updated_fields.append(field)
            setattr(obj, field, new_value)
        if obj_updated_fields:
            obj.save(update_fields=obj_updated_fields)

    # refresh the album cover when the new metadata embeds one
    if track.album and "album" in new_data and new_data["album"].get("cover_data"):
        common_utils.attach_file(
            track.album, "attachment_cover", new_data["album"].get("cover_data")
        )
......@@ -83,3 +83,4 @@ service_identity==18.1.0
markdown>=3.2,<4
bleach>=3,<4
feedparser==6.0.0b3
watchdog==0.10.2
......@@ -258,3 +258,12 @@ def test_monkey_patch_request_build_absolute_uri(
request = fake_request.get("/", **meta)
assert request.build_absolute_uri(path) == expected
def test_get_file_hash(tmpfile, settings):
    """get_file_hash should return an algorithm-prefixed hex digest."""
    settings.HASHING_ALGORITHM = "sha256"
    tmpfile.write(b"hello")
    # echo -n "hello" | sha256sum
    expected = "sha256:2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
    assert utils.get_file_hash(tmpfile) == expected
import os
import pytest
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music.management.commands import check_inplace_files
from funkwhale_api.music.management.commands import fix_uploads
from funkwhale_api.music.management.commands import prune_library
......@@ -18,7 +19,7 @@ def test_fix_uploads_bitrate_length(factories, mocker):
return_value={"bitrate": 42, "length": 43},
)
c.fix_file_data(dry_run=False)
c.fix_file_data(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
......@@ -41,7 +42,7 @@ def test_fix_uploads_size(factories, mocker):
mocker.patch("funkwhale_api.music.models.Upload.get_file_size", return_value=2)
c.fix_file_size(dry_run=False)
c.fix_file_size(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
......@@ -69,7 +70,7 @@ def test_fix_uploads_mimetype(factories, mocker):
mimetype="audio/something",
)
c = fix_uploads.Command()
c.fix_mimetypes(dry_run=False)
c.fix_mimetypes(dry_run=False, batch_size=100)
upload1.refresh_from_db()
upload2.refresh_from_db()
......@@ -78,6 +79,25 @@ def test_fix_uploads_mimetype(factories, mocker):
assert upload2.mimetype == "audio/something"
def test_fix_uploads_checksum(factories, mocker):
    # one upload with an existing checksum, one without
    upload1 = factories["music.Upload"]()
    upload2 = factories["music.Upload"]()
    # use queryset.update() to bypass Upload.save(), which would otherwise
    # recompute the checksum automatically
    upload1.__class__.objects.filter(pk=upload1.pk).update(checksum="test")
    upload2.__class__.objects.filter(pk=upload2.pk).update(checksum=None)
    c = fix_uploads.Command()
    c.fix_file_checksum(dry_run=False, batch_size=100)

    upload1.refresh_from_db()
    upload2.refresh_from_db()

    # not updated
    assert upload1.checksum == "test"
    # updated
    assert upload2.checksum == common_utils.get_file_hash(upload2.audio_file)
def test_prune_library_dry_run(factories):
prunable = factories["music.Track"]()
not_prunable = factories["music.Track"]()
......
......@@ -5,6 +5,7 @@ import pytest
from django.utils import timezone
from django.urls import reverse
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music import importers, models, tasks
from funkwhale_api.federation import utils as federation_utils
......@@ -164,6 +165,17 @@ def test_audio_track_mime_type(extention, mimetype, factories):
assert upload.mimetype == mimetype
@pytest.mark.parametrize("name", ["test.ogg", "test.mp3"])
def test_audio_track_checksum(name, factories):
    # saving an upload should compute its checksum from the audio content
    path = os.path.join(DATA_DIR, name)
    upload = factories["music.Upload"](audio_file__from_path=path, mimetype=None)

    with open(path, "rb") as f:
        expected = common_utils.get_file_hash(f)

    assert upload.checksum == expected
def test_upload_file_name(factories):
name = "test.mp3"
path = os.path.join(DATA_DIR, name)
......
......@@ -1329,3 +1329,40 @@ def test_can_import_track_with_same_position_in_same_discs_skipped(factories, mo
new_upload.refresh_from_db()
assert new_upload.import_status == "skipped"
def test_update_track_metadata(factories):
    # feed fresh file metadata through update_track_metadata and check that
    # the track, album, artist and album artist are all synced
    track = factories["music.Track"]()
    data = {
        "title": "Peer Gynt Suite no. 1, op. 46: I. Morning",
        "artist": "Edvard Grieg",
        "album_artist": "Edvard Grieg; Musopen Symphony Orchestra",
        "album": "Peer Gynt Suite no. 1, op. 46",
        "date": "2012-08-15",
        "position": "4",
        "disc_number": "2",
        "musicbrainz_albumid": "a766da8b-8336-47aa-a3ee-371cc41ccc75",
        "mbid": "bd21ac48-46d8-4e78-925f-d9cc2a294656",
        "musicbrainz_artistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823",
        "musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f",
        "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/",
        "copyright": "Someone",
        "comment": "hello there",
    }
    tasks.update_track_metadata(metadata.FakeMetadata(data), track)

    track.refresh_from_db()

    # string positions/disc numbers are coerced to integers
    assert track.title == data["title"]
    assert track.position == int(data["position"])
    assert track.disc_number == int(data["disc_number"])
    # the license string is matched against known licenses
    assert track.license.code == "cc-by-sa-4.0"
    assert track.copyright == data["copyright"]
    assert str(track.mbid) == data["mbid"]
    assert track.album.title == data["album"]
    assert track.album.release_date == datetime.date(2012, 8, 15)
    assert str(track.album.mbid) == data["musicbrainz_albumid"]
    assert track.artist.name == data["artist"]
    assert str(track.artist.mbid) == data["musicbrainz_artistid"]
    # only the first album artist from the semicolon-separated list is kept
    assert track.album.artist.name == "Edvard Grieg"
    assert str(track.album.artist.mbid) == "013c8e5b-d72a-4cd3-8dee-6c64d6125823"
......@@ -4,6 +4,8 @@ import pytest
from django.core.management import call_command
from django.core.management.base import CommandError
from funkwhale_api.common import utils as common_utils
from funkwhale_api.music.management.commands import import_files
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
......@@ -159,3 +161,194 @@ def test_import_files_in_place(factories, mocker, settings):
def test_storage_rename_utf_8_files(factories):
    # accented characters in uploaded filenames are transliterated to ASCII
    upload = factories["music.Upload"](audio_file__filename="été.ogg")
    assert upload.audio_file.name.endswith("ete.ogg")
@pytest.mark.parametrize("name", ["modified", "moved", "created", "deleted"])
def test_handle_event(name, mocker):
    """handle_event should route each event type to its dedicated handler."""
    stdout = mocker.Mock()
    extra = {"hello": "world"}
    mocked_handler = mocker.patch.object(import_files, "handle_{}".format(name))
    event = {"type": name}
    import_files.handle_event(event, stdout, **extra)
    mocked_handler.assert_called_once_with(event=event, stdout=stdout, **extra)
def test_handle_created(mocker):
    """A created event is forwarded verbatim to handle_modified."""
    patched_modified = mocker.patch.object(import_files, "handle_modified")
    stdout = mocker.Mock()
    event = mocker.Mock()
    extra = {"hello": "world"}
    import_files.handle_created(event, stdout, **extra)
    patched_modified.assert_called_once_with(event, stdout, **extra)
def test_handle_deleted(factories, mocker):
    # a deletion event should remove only in-place uploads (no audio_file)
    # from the watched library matching the deleted path
    stdout = mocker.Mock()
    event = {
        "path": "/path.mp3",
    }
    library = factories["music.Library"]()
    # in-place upload in the watched library: must be deleted
    deleted = factories["music.Upload"](
        library=library,
        source="file://{}".format(event["path"]),
        import_status="finished",
        audio_file=None,
    )
    kept = [
        # has its own stored audio file, so it is not an in-place import
        factories["music.Upload"](
            library=library,
            source="file://{}".format(event["path"]),
            import_status="finished",
        ),
        # in-place but belongs to another library
        factories["music.Upload"](
            source="file://{}".format(event["path"]),
            import_status="finished",
            audio_file=None,
        ),
    ]
    import_files.handle_deleted(
        event=event, stdout=stdout, library=library, in_place=True
    )

    with pytest.raises(deleted.DoesNotExist):
        deleted.refresh_from_db()

    # these should survive the deletion (refresh raises if they were removed)
    for upload in kept:
        upload.refresh_from_db()
def test_handle_moved(factories, mocker):
stdout = mocker.Mock()
event = {
"src_path": "/path.mp3",
"dest_path": "/new_path.mp3",
}
library = factories["music.Library"]()
updated = factories["music.Upload"](
library=library,
source="file://{}".format(event["src_path"]),
import_status="finished",
audio_file=None,
)
untouched = [
factories["music.Upload"](
library=library,
source="file://{}".format(event["src_path"]),
import_status="finished",
),
factories["music.Upload"](
source="file://{}".format(event["src_path"]),
import_status="finished",