diff --git a/api/funkwhale_api/music/management/commands/check_inplace_files.py b/api/funkwhale_api/music/management/commands/check_inplace_files.py index f274ee589086f5446ef83c7791c33688c072d014..d3c36212a3d4610fb72ebaea2750ee56536bd07f 100644 --- a/api/funkwhale_api/music/management/commands/check_inplace_files.py +++ b/api/funkwhale_api/music/management/commands/check_inplace_files.py @@ -47,7 +47,7 @@ class Command(BaseCommand): self.stdout.write("Checking {} in-place imported files…".format(total)) missing = [] - for i, row in enumerate(candidates.values("id", "source")): + for i, row in enumerate(candidates.values("id", "source").iterator()): path = row["source"].replace("file://", "") progress(self.stdout, i + 1, total) if not os.path.exists(path): diff --git a/api/funkwhale_api/music/management/commands/import_files.py b/api/funkwhale_api/music/management/commands/import_files.py index 33f8ed2d1f6fb10c299465fa752efb4c058a4dd4..2bd6fb9e429eb0d994c576dbe870405a7ab28cc4 100644 --- a/api/funkwhale_api/music/management/commands/import_files.py +++ b/api/funkwhale_api/music/management/commands/import_files.py @@ -1,13 +1,40 @@ -import glob +import itertools import os import urllib.parse +import time from django.conf import settings from django.core.files import File from django.core.management.base import BaseCommand, CommandError from django.utils import timezone -from funkwhale_api.music import models, tasks +from funkwhale_api.music import models, tasks, utils + + +def crawl_dir(dir, extensions, recursive=True): + if os.path.isfile(dir): + yield dir + return + with os.scandir(dir) as scanner: + for entry in scanner: + if entry.is_file(): + for e in extensions: + if entry.name.lower().endswith(".{}".format(e.lower())): + yield entry.path + elif recursive and entry.is_dir(): + yield from crawl_dir(entry, extensions, recursive=recursive) + + +def batch(iterable, n=1): + has_entries = True + while has_entries: + current = [] + for i in range(0, n): + try: + current.append(next(iterable)) + except StopIteration: + has_entries = False + yield current class Command(BaseCommand): @@ -89,6 +116,7 @@ class Command(BaseCommand): "of overhead on your server and on servers you are federating with." ), ) + parser.add_argument("-e", "--extension", nargs="+") parser.add_argument( "--broadcast", @@ -119,10 +147,17 @@ class Command(BaseCommand): help="Do NOT prompt the user for input of any kind.", ) - def handle(self, *args, **options): - glob_kwargs = {} - matching = [] + parser.add_argument( + "--batch-size", + "-s", + dest="batch_size", + default=1000, + type=int, + help="Size of each batch, only used when crawling large collections", + ) + def handle(self, *args, **options): + self.is_confirmed = False try: library = models.Library.objects.select_related("actor__user").get( uuid__startswith=options["library_id"] @@ -133,14 +168,100 @@ class Command(BaseCommand): if not library.actor.get_user(): raise CommandError("Library {} is not a local library".format(library.uuid)) - if options["recursive"]: - glob_kwargs["recursive"] = True - for import_path in options["path"]: - matching += glob.glob(import_path, **glob_kwargs) - raw_matching = sorted(list(set(matching))) + if options["in_place"]: + self.stdout.write( + "Checking imported paths against settings.MUSIC_DIRECTORY_PATH" + ) + + for import_path in options["path"]: + p = settings.MUSIC_DIRECTORY_PATH + if not p: + raise CommandError( + "Importing in-place requires setting the " + "MUSIC_DIRECTORY_PATH variable" + ) + if p and not import_path.startswith(p): + raise CommandError( + "Importing in-place only works if importing" + "from {} (MUSIC_DIRECTORY_PATH), as this directory" + "needs to be accessible by the webserver." + "Culprit: {}".format(p, import_path) + ) + + extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS + crawler = itertools.chain( + *[ + crawl_dir(p, extensions=extensions, recursive=options["recursive"]) + for p in options["path"] + ] + ) + errors = [] + total = 0 + start_time = time.time() + reference = options["reference"] or "cli-{}".format(timezone.now().isoformat()) + + import_url = "{}://{}/content/libraries/{}/upload?{}" + import_url = import_url.format( + settings.FUNKWHALE_PROTOCOL, + settings.FUNKWHALE_HOSTNAME, + str(library.uuid), + urllib.parse.urlencode([("import", reference)]), + ) + self.stdout.write( + "For details, please refer to import reference '{}' or URL {}".format( + reference, import_url + ) + ) + batch_start = None + batch_duration = None + for i, entries in enumerate(batch(crawler, options["batch_size"])): + total += len(entries) + batch_start = time.time() + time_stats = "" + if i > 0: + time_stats = " - running for {}s, previous batch took {}s".format( + int(time.time() - start_time), int(batch_duration), + ) + if entries: + self.stdout.write( + "Handling batch {} ({} items){}".format( + i + 1, options["batch_size"], time_stats, + ) + ) + batch_errors = self.handle_batch( + library=library, + paths=entries, + batch=i + 1, + reference=reference, + options=options, + ) + if batch_errors: + errors += batch_errors + + batch_duration = time.time() - batch_start + + message = "Successfully imported {} tracks in {}s" + if options["async_"]: + message = "Successfully launched import for {} tracks in {}s" + + self.stdout.write( + message.format(total - len(errors), int(time.time() - start_time)) + ) + if len(errors) > 0: + self.stderr.write("{} tracks could not be imported:".format(len(errors))) + + for path, error in errors: + self.stderr.write("- {}: {}".format(path, error)) + self.stdout.write( + "For details, please refer to import reference '{}' or URL {}".format( + reference, import_url + ) + ) + + def handle_batch(self, library, paths, batch, reference, options): matching = [] - for m in raw_matching: + for m in paths: # In some situations, the path is encoded incorrectly on the filesystem # so we filter out faulty paths and display a warning to the user. # see https://dev.funkwhale.audio/funkwhale/funkwhale/issues/138 @@ -160,96 +281,57 @@ class Command(BaseCommand): ) ) - if options["in_place"]: - self.stdout.write( - "Checking imported paths against settings.MUSIC_DIRECTORY_PATH" - ) - p = settings.MUSIC_DIRECTORY_PATH - if not p: - raise CommandError( - "Importing in-place requires setting the " - "MUSIC_DIRECTORY_PATH variable" - ) - for m in matching: - if not m.startswith(p): - raise CommandError( - "Importing in-place only works if importing" - "from {} (MUSIC_DIRECTORY_PATH), as this directory" - "needs to be accessible by the webserver." - "Culprit: {}".format(p, m) - ) if not matching: raise CommandError("No file matching pattern, aborting") if options["replace"]: filtered = {"initial": matching, "skipped": [], "new": matching} - message = "- {} files to be replaced" + message = " - {} files to be replaced" import_paths = matching else: filtered = self.filter_matching(matching, library) - message = "- {} files already found in database" + message = " - {} files already found in database" import_paths = filtered["new"] - self.stdout.write("Import summary:") + self.stdout.write(" Import summary:") self.stdout.write( - "- {} files found matching this pattern: {}".format( + " - {} files found matching this pattern: {}".format( len(matching), options["path"] ) ) self.stdout.write(message.format(len(filtered["skipped"]))) - self.stdout.write("- {} new files".format(len(filtered["new"]))) + self.stdout.write(" - {} new files".format(len(filtered["new"]))) - self.stdout.write( - "Selected options: {}".format( - ", ".join(["in place" if options["in_place"] else "copy music files"]) + if batch == 1: + self.stdout.write( + " Selected options: {}".format( + ", ".join( + ["in place" if options["in_place"] else "copy music files"] + ) + ) ) - ) if len(filtered["new"]) == 0: - self.stdout.write("Nothing new to import, exiting") + self.stdout.write(" Nothing new to import, exiting") return - if options["interactive"]: + if options["interactive"] and not self.is_confirmed: message = ( "Are you sure you want to do this?\n\n" "Type 'yes' to continue, or 'no' to cancel: " ) if input("".join(message)) != "yes": raise CommandError("Import cancelled.") - reference = options["reference"] or "cli-{}".format(timezone.now().isoformat()) - - import_url = "{}://{}/content/libraries/{}/upload?{}" - import_url = import_url.format( - settings.FUNKWHALE_PROTOCOL, - settings.FUNKWHALE_HOSTNAME, - str(library.uuid), - urllib.parse.urlencode([("import", reference)]), - ) - self.stdout.write( - "For details, please refer to import reference '{}' or URL {}".format( - reference, import_url - ) - ) + self.is_confirmed = True errors = self.do_import( - import_paths, library=library, reference=reference, options=options - ) - message = "Successfully imported {} tracks" - if options["async_"]: - message = "Successfully launched import for {} tracks" - - self.stdout.write(message.format(len(import_paths))) - if len(errors) > 0: - self.stderr.write("{} tracks could not be imported:".format(len(errors))) - - for path, error in errors: - self.stderr.write("- {}: {}".format(path, error)) - - self.stdout.write( - "For details, please refer to import reference '{}' or URL {}".format( - reference, import_url - ) + import_paths, + library=library, + reference=reference, + batch=batch, + options=options, ) + return errors def filter_matching(self, matching, library): sources = ["file://{}".format(p) for p in matching] @@ -266,17 +348,20 @@ class Command(BaseCommand): } return result - def do_import(self, paths, library, reference, options): - message = "{i}/{total} Importing {path}..." + def do_import(self, paths, library, reference, batch, options): + message = "[batch {batch}] {i}/{total} Importing {path}..." if options["async_"]: - message = "{i}/{total} Launching import for {path}..." + message = "[batch {batch}] {i}/{total} Launching import for {path}..." # we create an upload binded to the library async_ = options["async_"] errors = [] for i, path in list(enumerate(paths)): + if options["verbosity"] > 1: + self.stdout.write( + message.format(batch=batch, path=path, i=i + 1, total=len(paths)) + ) try: - self.stdout.write(message.format(path=path, i=i + 1, total=len(paths))) self.create_upload( path, reference, diff --git a/changes/changelog.d/importer.enhancement b/changes/changelog.d/importer.enhancement new file mode 100644 index 0000000000000000000000000000000000000000..f18d70117dbdbff9ac5d04b27fcf8addc1a7801d --- /dev/null +++ b/changes/changelog.d/importer.enhancement @@ -0,0 +1 @@ +CLI Importer is now more reliable and less resource-hungry on large libraries diff --git a/changes/notes.rst b/changes/notes.rst index 96ac3d7651f92166072a2fb200c0dd57606851e3..a1dbb5590d5c2e85a3a2eafb2aa1f90bdb3bae36 100644 --- a/changes/notes.rst +++ b/changes/notes.rst @@ -5,3 +5,20 @@ Next release notes Those release notes refer to the current development branch and are reset after each release. + +More reliable CLI importer +-------------------------- + +Our CLI importer is now more reliable and less prone to Out-of-Memory issues, especially when scanning large libraries. (hundreds of GB or bigger) + +We've also improved the directory crawling logic, so that you don't have to use glob patterns or specify extensions when importing. + +This means you can replace scripts that look like this:: + + python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" "/srv/funkwhale/data/music/**/*.mp3" --recursive --noinput + +By this: + + python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput + +And Funkwhale will happily import any supported audio file from the specified directory. diff --git a/docs/admin/importing-music.rst b/docs/admin/importing-music.rst index aea601681ab66031e6c0bddce6fc123ba6a96a1a..bc0a642cbf6d651ec869e9c38716e4cf8b86c664 100644 --- a/docs/admin/importing-music.rst +++ b/docs/admin/importing-music.rst @@ -15,7 +15,7 @@ You can import those tracks as follows, assuming they are located in .. code-block:: bash export LIBRARY_ID="<your_libary_id>" - python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/*.ogg" --recursive --noinput + python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput When you use docker, the ``/srv/funkwhale/data/music`` is mounted from the host to the ``/music`` directory on the container: @@ -23,14 +23,14 @@ to the ``/music`` directory on the container: .. code-block:: bash export LIBRARY_ID="<your_libary_id>" - docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/**/*.ogg" --recursive --noinput + docker-compose run --rm api python manage.py import_files $LIBRARY_ID "/music/" --recursive --noinput When you installed Funkwhale via ansible, you need to call a script instead of Python, and the folder path must be adapted accordingly: .. code-block:: bash export LIBRARY_ID="<your_libary_id>" - /srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/**/**/*.ogg" --recursive --noinput + /srv/funkwhale/manage import_files $LIBRARY_ID "/srv/funkwhale/data/music/" --recursive --noinput .. note:: You'll have to create a library in the Web UI before to get your library ID. Simply visit @@ -107,9 +107,9 @@ you can create a symlink like this:: ln -s /media/mynfsshare /srv/funkwhale/data/music/nfsshare And import music from this share with this command:: - + export LIBRARY_ID="<your_libary_id>" - python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/**/*.ogg" --recursive --noinput --in-place + python api/manage.py import_files $LIBRARY_ID "/srv/funkwhale/data/music/nfsshare/" --recursive --noinput --in-place On docker setups, it will require a bit more work, because while the ``/srv/funkwhale/data/music`` is mounted in containers, symlinked directories are not.