Skip to content
Snippets Groups Projects
lyrics.py 739 B
Newer Older
  • Learn to ignore specific revisions
  • import urllib.request
    import html.parser
    from bs4 import BeautifulSoup
    
    
    def _get_html(url):
        """Fetch *url* and return the response body decoded to text.

        The charset declared in the response's Content-Type header is used
        when one is present. If the header declares no charset, or names a
        codec Python does not know, the body is decoded as UTF-8.

        Errors raised by ``urllib.request.urlopen`` propagate unchanged, as
        does ``UnicodeDecodeError`` when the body is not valid in the chosen
        encoding.
        """
        with urllib.request.urlopen(url) as response:
            raw_body = response.read()
            # get_content_charset() returns None when no charset is declared.
            charset = response.headers.get_content_charset() or 'utf-8'
        try:
            return raw_body.decode(charset)
        except LookupError:
            # The server advertised an unknown codec name; fall back to UTF-8.
            return raw_body.decode('utf-8')
    
    
    def extract_content(html):
        """Return the child nodes of the first ``<div class="lyricbox">``.

        *html* is parsed with BeautifulSoup using the "html.parser" backend.
        An ``IndexError`` propagates when the markup has no matching div.
        """
        document = BeautifulSoup(html, "html.parser")
        lyric_boxes = document.find_all("div", class_='lyricbox')
        first_box = lyric_boxes[0]
        return first_box.contents
    
    
    def clean_content(contents):
        """Flatten an iterable of parsed nodes into a plain-text string.

        Each item of *contents* is handled as follows:

        * an item equal to ``'\\n'`` is dropped;
        * an item whose ``name`` is ``'script'`` is dropped;
        * an item whose ``name`` is ``'br'`` becomes a newline;
        * any other item contributes its ``text`` attribute, or ``str(item)``
          when it has no ``text`` attribute.

        Every item other than a bare ``'\\n'`` must expose a ``name``
        attribute; otherwise ``AttributeError`` propagates.

        Returns the concatenation of the kept pieces, in order.
        """
        pieces = []
        for node in contents:
            if node == '\n':
                continue
            tag_name = node.name
            if tag_name == 'script':
                continue
            if tag_name == 'br':
                pieces.append("\n")
                continue
            # Keep the try body to the attribute access only, so unrelated
            # AttributeErrors are not silently swallowed.
            try:
                text = node.text
            except AttributeError:
                text = str(node)
            pieces.append(text)
        # Join once rather than growing the string with += on every pass.
        return "".join(pieces)