diff --git a/README.md b/README.md index 984a159abf50283fe7bf5e5f9f03dc20f086921a..5eb82e1661baf47b79b950b5f662954a10565f79 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,23 @@ +Install +------- + +`pip install .` + +Crawling the network by hand +---------------------------- + +``` +# Crawl the network starting from open.audio +funkwhale-network crawl open.audio +# Crawl the network starting from pods listed at https://network.funkwhale.audio +funkwhale-network crawl --use-public +# Limit crawl to 5 max successive passes +funkwhale-network crawl --use-public --passes 5 +# Show detailed results per domain +funkwhale-network crawl --use-public --detail "" +# Show detailed results per domain with specific columns, ordering by a given column +funkwhale-network crawl --use-public --detail "Domain,Listenings,Active users (30d)" --sort="Listenings" Running tests ------------- diff --git a/funkwhale_network/cli.py b/funkwhale_network/cli.py index 93ba2956d917124d57b961f2a222cb4543af77ec..f24d9d20c3c5ad031f904e914417a878b23ad381 100644 --- a/funkwhale_network/cli.py +++ b/funkwhale_network/cli.py @@ -4,11 +4,72 @@ import click import logging.config import arq.worker import functools +import ssl +import sys + +from . import output + +SSL_PROTOCOLS = (asyncio.sslproto.SSLProtocol,) +try: + import uvloop.loop +except ImportError: + pass +else: + SSL_PROTOCOLS = (*SSL_PROTOCOLS, uvloop.loop.SSLProtocol) + + +def ignore_aiohttp_ssl_eror(loop): + """Ignore aiohttp #3535 / cpython #13548 issue with SSL data after close + + There is an issue in Python 3.7 up to 3.7.3 that over-reports a + ssl.SSLError fatal error (ssl.SSLError: [SSL: KRB5_S_INIT] application data + after close notify (_ssl.c:2609)) after we are already done with the + connection. See GitHub issues aio-libs/aiohttp#3535 and + python/cpython#13548. 
+ + Given a loop, this sets up an exception handler that ignores this specific + exception, but passes everything else on to the previous exception handler + this one replaces. + + Checks for fixed Python versions, disabling itself when running on 3.7.4+ + or 3.8. + + """ + if sys.version_info >= (3, 7, 4): + return + + orig_handler = loop.get_exception_handler() + + def ignore_ssl_error(loop, context): + if context.get("message") in { + "SSL error in data received", + "Fatal error on transport", + "SSL handshake failed", + "[SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error", + }: + # validate we have the right exception, transport and protocol + exception = context.get("exception") + protocol = context.get("protocol") + if ( + isinstance(exception, ssl.SSLError) + and exception.reason in ("KRB5_S_INIT", "TLSV1_ALERT_INTERNAL_ERROR") + and isinstance(protocol, SSL_PROTOCOLS) + ): + if loop.get_debug(): + asyncio.log.logger.debug("Ignoring asyncio SSL KRB5_S_INIT error") + return + if orig_handler is not None: + orig_handler(loop, context) + else: + loop.default_exception_handler(context) + + loop.set_exception_handler(ignore_ssl_error) def async_command(f): def wrapper(*args, **kwargs): loop = asyncio.get_event_loop() + ignore_aiohttp_ssl_eror(loop) return loop.run_until_complete(f(*args, **kwargs)) return functools.update_wrapper(wrapper, f) @@ -122,6 +183,124 @@ async def poll(domain): await pool.wait_closed() +NOOP = object() + + +@cli.command() +@click.argument("domain", type=str, nargs=-1) +@click.option("--use-public", is_flag=True) +@click.option("--detail", default=NOOP) +@click.option("--passes", type=click.INT, default=999) +@click.option("--sort", default="Active users (30d)") +@async_command +async def crawl(domain, use_public, detail, passes, sort): + """ + Crawl the network starting from the given domain(s). + """ + from . import crawler + from . 
import settings + + kwargs = crawler.get_session_kwargs() + async with aiohttp.ClientSession(**kwargs) as session: + if use_public: + url = "https://network.funkwhale.audio/api/domains?up=true" + click.echo("Retrieving list of public pods from {}…".format(url)) + response = await session.get(url) + json = await response.json() + domain = set([d["name"] for d in json["results"]]) + click.echo("Launching crawl with {} seed domains…".format(len(domain))) + results = await crawler.crawl_all( + session, *domain, stdout=click.echo, max_passes=passes + ) + + click.echo("Complete after {} passes:".format(results["pass_number"])) + aggregate = aggregate_crawl_results(results["results"]) + + if detail != NOOP: + + click.echo("") + click.echo("Info per domain") + click.echo("===============") + click.echo("") + + if not detail: + fields = [ + "Domain", + "Active users (30d)", + "Users", + "Listenings", + "Open registrations", + "Anonymous access", + "Private", + ] + else: + fields = detail.split(",") + + click.echo( + output.table( + results["results"].values(), type="Domain", fields=fields, sort=sort + ) + ) + + click.echo("") + click.echo("Aggregated data") + click.echo("===============") + click.echo("") + click.echo( + output.obj_table( + aggregate, + type="Summary", + fields=[ + "Domains", + "Active users (30d)", + "Active users (180d)", + "Users", + "Listenings", + "Tracks", + "Albums", + "Artists", + "Hours of music", + "Open registrations", + "Federation enabled", + "Anonymous access", + "Private", + ], + ) + ) + + +def aggregate_crawl_results(domains_info): + def count_true(values): + return sum([1 for v in values if v]) + + def permissive_sum(values): + return sum([v for v in values if v]) + + fields = { + "domain": len, + "usage_users_total": permissive_sum, + "usage_users_active_half_year": permissive_sum, + "usage_users_active_month": permissive_sum, + "usage_listenings_total": permissive_sum, + "library_tracks_total": permissive_sum, + "library_albums_total": 
permissive_sum, + "library_artists_total": permissive_sum, + "library_music_hours": permissive_sum, + "open_registrations": count_true, + "federation_enabled": count_true, + "anonymous_can_listen": count_true, + "private": count_true, + } + aggregate = {} + for field, handler in fields.items(): + values = [] + for info in domains_info.values(): + values.append(info[field]) + aggregate[field] = handler(values) + + return aggregate + + @worker.command() @click.option("-v", "--verbose", is_flag=True) @click.option("--check", is_flag=True) diff --git a/funkwhale_network/crawler.py b/funkwhale_network/crawler.py index 3181cd133f8379a115fe8971c699a99b103f4e8a..5a287903bc6fa5946d5d371428a99910391507a3 100644 --- a/funkwhale_network/crawler.py +++ b/funkwhale_network/crawler.py @@ -1,4 +1,5 @@ import aiohttp +import asyncio import sys from . import db @@ -24,14 +25,14 @@ async def fetch_nodeinfo(session, domain): async def get_well_known_data(session, domain, protocol="https"): url = f"https://{domain}/.well-known/nodeinfo" - response = await session.get(url) + response = await session.get(url, ssl=False) return await response.json() async def get_nodeinfo(session, nodeinfo): for link in nodeinfo.get("links", []): if link["rel"] == "http://nodeinfo.diaspora.software/ns/schema/2.0": - response = await session.get(link["href"]) + response = await session.get(link["href"], ssl=False) return await response.json() raise exceptions.NoNodeInfo() @@ -51,10 +52,77 @@ async def check(conn, session, domain, stdout=sys.stdout): await save_check(conn, cleaned_check) +async def crawl_all(session, *domains, stdout, max_passes): + data = { + "pending_domains": set(domains), + "valid_domains": set(), + "invalid_domains": set(), + "handled_domains": set(), + "results": {}, + "pass_number": 0, + } + + def print_pass(): + stdout( + "[Pass {pass_number}] {pending_domains} new domains to crawl, {handled_domains} checked, {valid_domains} valid".format( + pass_number=data["pass_number"], + 
pending_domains=len(data["pending_domains"]), + handled_domains=len(data["handled_domains"]), + valid_domains=len(data["valid_domains"]), + ) + ) + + while data["pending_domains"] and data["pass_number"] < max_passes: + data["pass_number"] += 1 + print_pass() + tasks = [ + crawl_single(session, domain, data) for domain in data["pending_domains"] + ] + await asyncio.wait(tasks) + if data["pass_number"] < max_passes: + print_pass() + + return data + + +async def crawl_single(session, domain, data): + try: + nodeinfo_data = await fetch_nodeinfo(session, domain) + cleaned_data = clean_check( + {"domain": domain, "up": True}, clean_nodeinfo(nodeinfo_data) + ) + except Exception as e: + data["invalid_domains"].add(domain) + return + finally: + data["pending_domains"].remove(domain) + data["handled_domains"].add(domain) + + nodes_url = recursive_getattr(nodeinfo_data, "metadata.knownNodesListUrl") + if nodes_url: + try: + await gather_known_nodes(session, nodes_url, data) + except: + pass + + data["valid_domains"].add(domain) + data["results"][domain] = cleaned_data + + +async def gather_known_nodes(session, url, data): + fetch_url = url + while fetch_url: + response = await session.get(fetch_url, ssl=False) + result = await response.json() + fetch_url = result.get("next") + known_domains = set([d["name"] for d in result["results"]]) + data["pending_domains"] |= known_domains - data["handled_domains"] + + def clean_nodeinfo(data): schema = schemas.NodeInfo2Schema() result = schema.load(data) - return result.data + return result def recursive_getattr(obj, key, permissive=True): diff --git a/funkwhale_network/output.py b/funkwhale_network/output.py new file mode 100644 index 0000000000000000000000000000000000000000..2d93dafa48a8d11990c73251bf50a1c7ccf43893 --- /dev/null +++ b/funkwhale_network/output.py @@ -0,0 +1,101 @@ +import tabulate + +FIELDS = { + "Domain": { + "Domain": {"field": "domain", "truncate": 0}, + "Active users (30d)": {"field": "usage_users_active_month"}, 
+ "Active users (180d)": {"field": "usage_users_active_half_year"}, + "Users": {"field": "usage_users_total"}, + "Listenings": {"field": "usage_listenings_total"}, + "Tracks": {"field": "library_tracks_total"}, + "Albums": {"field": "library_albums_total"}, + "Artists": {"field": "library_artists_total"}, + "Hours of music": {"field": "library_music_hours"}, + "Open registrations": {"field": "open_registrations"}, + "Federation enabled": {"field": "federation_enabled"}, + "Anonymous access": {"field": "anonymous_can_listen"}, + "Private": {"field": "private"}, + }, + "Summary": { + "Domains": {"field": "domain"}, + "Active users (30d)": {"field": "usage_users_active_month"}, + "Active users (180d)": {"field": "usage_users_active_half_year"}, + "Users": {"field": "usage_users_total"}, + "Listenings": {"field": "usage_listenings_total"}, + "Tracks": {"field": "library_tracks_total"}, + "Albums": {"field": "library_albums_total"}, + "Artists": {"field": "library_artists_total"}, + "Hours of music": {"field": "library_music_hours"}, + "Open registrations": {"field": "open_registrations"}, + "Federation enabled": {"field": "federation_enabled"}, + "Anonymous access": {"field": "anonymous_can_listen"}, + "Private": {"field": "private"}, + }, +} + +TABLE_FORMATS = sorted(tabulate._table_formats.keys()) + + +def get_value(obj, config, truncate=30): + field = config["field"] + value = obj[field] + + if config.get("handler"): + value = config["handler"](value) + value = str(value) + if truncate and len(value) > truncate: + value = value[:truncate] + "…" + return value + + +def table(objects, fields, type, headers=True, format="simple", sort=None): + + configs = {} + + if sort: + reversed = sort.startswith("-") + sort = sort.lstrip("-") + sort_field = FIELDS[type][sort]["field"] + objects = sorted(objects, key=lambda v: v[sort_field] or 0, reverse=reversed) + for f in fields: + try: + configs[f] = FIELDS[type][f] + except KeyError: + try: + configs[f] = FIELDS["*"][f] + 
except KeyError: + raise ValueError("{} is not a valid field for type {}".format(f, type)) + + headers = fields if headers else [] + rows = [ + [ + get_value(obj, configs[f], truncate=configs[f].get("truncate", 30)) + for f in fields + ] + for obj in objects + ] + return tabulate.tabulate(rows, headers=headers, tablefmt=format) + + +def obj_table(obj, fields, type, headers=True, format="simple"): + """ + same as table(), but output a two-column table for a single object, + with fields on the left and values on the right + """ + configs = {} + + for f in fields: + try: + configs[f] = FIELDS[type][f] + except KeyError: + try: + configs[f] = FIELDS["*"][f] + except KeyError: + raise ValueError("{} is not a valid field for type {}".format(f, type)) + + rows = [ + (f, get_value(obj, configs[f], truncate=configs[f].get("truncate", 30))) + for f in fields + ] + + return tabulate.tabulate(rows, headers=[], tablefmt=format) diff --git a/funkwhale_network/schemas.py b/funkwhale_network/schemas.py index a0126b44da958a0b246f85d2a0853e4cfc8aca49..b9ef32e3085fe15f25fcadeccdb026859ba3e956 100644 --- a/funkwhale_network/schemas.py +++ b/funkwhale_network/schemas.py @@ -31,6 +31,9 @@ class SoftwareSchema(marshmallow.Schema): ) version = VersionField(required=True) + class Meta: + unknown = marshmallow.EXCLUDE + """ "openRegistrations": False, @@ -59,19 +62,31 @@ class SoftwareSchema(marshmallow.Schema): class StatisticsSchema(marshmallow.Schema): total = marshmallow.fields.Integer(required=True) + class Meta: + unknown = marshmallow.EXCLUDE + class UsageStatisticsSchema(StatisticsSchema): activeHalfyear = marshmallow.fields.Integer(required=False) activeMonth = marshmallow.fields.Integer(required=False) + class Meta: + unknown = marshmallow.EXCLUDE + class UsageSchema(marshmallow.Schema): users = marshmallow.fields.Nested(UsageStatisticsSchema, required=True) + class Meta: + unknown = marshmallow.EXCLUDE + class MusicSchema(marshmallow.Schema): hours = 
marshmallow.fields.Integer(required=False) + class Meta: + unknown = marshmallow.EXCLUDE + class LibraryMetadataSchema(marshmallow.Schema): anonymousCanListen = marshmallow.fields.Boolean(required=True) @@ -81,10 +96,16 @@ class LibraryMetadataSchema(marshmallow.Schema): artists = marshmallow.fields.Nested(StatisticsSchema, required=False) music = marshmallow.fields.Nested(MusicSchema, required=False) + class Meta: + unknown = marshmallow.EXCLUDE + class MetadataUsageSchema(marshmallow.Schema): listenings = marshmallow.fields.Nested(StatisticsSchema, required=False) + class Meta: + unknown = marshmallow.EXCLUDE + class MetadataSchema(marshmallow.Schema): nodeName = marshmallow.fields.String(required=True) @@ -92,6 +113,9 @@ class MetadataSchema(marshmallow.Schema): library = marshmallow.fields.Nested(LibraryMetadataSchema, required=True) usage = marshmallow.fields.Nested(MetadataUsageSchema, required=True) + class Meta: + unknown = marshmallow.EXCLUDE + class NodeInfo2Schema(marshmallow.Schema): software = marshmallow.fields.Nested(SoftwareSchema, required=True) @@ -100,4 +124,4 @@ class NodeInfo2Schema(marshmallow.Schema): metadata = marshmallow.fields.Nested(MetadataSchema, required=True) class Meta: - strict = True + unknown = marshmallow.EXCLUDE diff --git a/setup.cfg b/setup.cfg index ccafa9e6488f7ffaa1a57f8d07c379a8f71fb4f3..c0209a16874069a54408b9498225dd9d4c1dbc23 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,11 +22,13 @@ install_requires = aiohttp arq==0.15 click - marshmallow<3 + marshmallow>3,<4 semver asynctest django-environ webargs + tabulate + [options.entry_points] console_scripts = funkwhale-network = funkwhale_network.cli:main