# lyrics.py

import urllib.request
import html.parser
from bs4 import BeautifulSoup


def _get_html(url):
    """Download *url* and return the response body as text.

    The body is read in full and decoded as UTF-8 regardless of any
    charset the server may declare.

    Args:
        url: Anything accepted by ``urllib.request.urlopen``.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Errors raised by ``urllib.request.urlopen`` are not caught here
        and propagate to the caller.
    """
    with urllib.request.urlopen(url) as page:
        payload = page.read()
    # Decode once the connection has been released.
    return str(payload, 'utf-8')


def extract_content(html):
    """Return the child nodes of the first ``<div class="lyricbox">``.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser`` backend.

    Returns:
        The ``contents`` of the first matching ``div`` element.

    Raises:
        IndexError: If the markup contains no ``div`` with class
            ``lyricbox``.
    """
    document = BeautifulSoup(html, "html.parser")
    lyric_boxes = document.find_all("div", class_='lyricbox')
    # Only the first lyric box on the page is used.
    first_box = lyric_boxes[0]
    return first_box.contents


def clean_content(contents):
    """Flatten a sequence of parsed nodes into a single text string.

    Each node is handled as follows:

    * a node equal to ``'\\n'`` is dropped;
    * a node whose ``name`` is ``'script'`` is dropped;
    * a node whose ``name`` is ``'br'`` becomes a newline;
    * any other node contributes its ``text`` attribute, or ``str(node)``
      when it has no ``text`` attribute.

    Nodes without a ``name`` attribute (for example plain ``str`` objects)
    are treated as text rather than raising ``AttributeError``.

    Args:
        contents: Iterable of nodes, presumably the list returned by
            ``extract_content`` (confirm against the caller).

    Returns:
        The concatenated text as a ``str``; empty if *contents* is empty.
    """
    pieces = []
    for node in contents:
        if node == '\n':
            continue
        # Text-like nodes may not expose ``name``; treat them as untagged
        # so they reach the text fallback below instead of crashing here.
        tag_name = getattr(node, 'name', None)
        if tag_name == 'script':
            continue
        if tag_name == 'br':
            pieces.append("\n")
            continue
        try:
            text = node.text
        except AttributeError:
            text = str(node)
        pieces.append(text)
    # Join once at the end instead of growing a string inside the loop.
    return "".join(pieces)