# Source code for lyricsmaster.providers

# -*- coding: utf-8 -*-

"""Main module.

This module defines the Api interface for the various Lyrics providers.
All lyrics providers inherit from the base class LyricsProvider.

"""

# We use abstract methods to ensure that all future classes inheriting from LyricsProvider will
# implement the required methods in order to have a nice and consistent API.
from abc import ABCMeta, abstractmethod

import re
import urllib3
from urllib.parse import quote, urlsplit, urlunsplit
import certifi
from bs4 import BeautifulSoup

# We use gevent in order to make asynchronous http requests while downloading lyrics.
# It is also used to patch the socket module so that SOCKS5 can be used instead when interfacing with the Tor controller.
import gevent.monkey
from gevent.pool import Pool

# Python 2.7 compatibility
# Works for Python 2 and 3
try:
    from importlib import reload
except ImportError:
    try:
        from imp import reload
    except:
        pass

# Importing the app models and utilities
from .models import Song, Album, Discography
from .utils import normalize, logger

# TODO: advertise the fact that contributors can add new lyrics providers by conforming to the LyricsProvider abstract base class
class LyricsProvider:
    """
    This is the base class for all Lyrics Providers. If you wish to subclass this class, you must implement all
    the methods defined in this class to be compatible with the LyricsMaster API.
    Requests to fetch songs are executed asynchronously for better performance.
    Tor anonymisation is provided if tor is installed on the system and a TorController is passed at instance creation.

    :param tor_controller: TorController Object.

    """
    # NOTE(review): `__metaclass__` is only honoured by Python 2. On Python 3 it is a plain class attribute,
    # so the abstract methods below are not enforced at instantiation time. Left unchanged on purpose.
    __metaclass__ = ABCMeta
    name = ''

    def __init__(self, tor_controller=None):
        if not self.__socket_is_patched():
            gevent.monkey.patch_socket()
        self.tor_controller = tor_controller
        if not self.tor_controller:
            user_agent = {
                'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
            self.session = urllib3.PoolManager(maxsize=10, cert_reqs='CERT_REQUIRED', ca_certs=certifi.where(),
                                               headers=user_agent)
        else:
            self.session = self.tor_controller.get_tor_session()
        self.__tor_status__()

    def __repr__(self):
        return '{0}.{1}({2})'.format(__name__, self.__class__.__name__, repr(self.tor_controller))

    def __tor_status__(self):
        """
        Informs the user of the Tor status.

        """
        if not self.tor_controller:
            logger.info('Anonymous requests disabled. The connexion will not be anonymous.')
        elif not self.tor_controller.controlport:
            logger.info('Anonymous requests enabled. The Tor circuit will change according to the Tor network defaults.')
        else:
            logger.info('Anonymous requests enabled. The Tor circuit will change for each album.')

    def __socket_is_patched(self):
        """
        Checks if the socket is patched or not.

        :return: bool.
        """
        return gevent.monkey.is_module_patched('socket')

    @abstractmethod
    def _has_lyrics(self, page):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Checks if the lyrics provider has the lyrics for the song or not.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        pass

    @abstractmethod
    def _has_artist(self, page):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Check if the artist is in the lyrics provider's database.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        pass

    @abstractmethod
    def _make_artist_url(self, artist):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string or None.
        """
        pass

    @abstractmethod
    def _clean_string(self, text):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Formats the text to conform to the lyrics provider formatting.

        :param text: string.
        :return: string or None.
        """
        pass

    @abstractmethod
    def get_albums(self, raw_artist_page):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects.
        """
        pass

    @abstractmethod
    def get_album_infos(self, tag):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Extracts the Album informations from the tag

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date.
        """
        pass

    @abstractmethod
    def get_songs(self, album):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
        """
        pass

    @abstractmethod
    def create_song(self, link, artist, album_title):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        pass

    @abstractmethod
    def extract_lyrics(self, lyrics_page):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics.
        """
        pass

    @abstractmethod
    def extract_writers(self, lyrics_page):
        """
        Must be implemented by children classes conforming to the LyricsMaster API.

        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers.
        """
        pass

    def get_page(self, url):
        """
        Fetches the supplied url and returns a request object.

        Any error raised while building or sending the request is logged and swallowed.

        :param url: string.
        :return: urllib3.response.HTTPResponse Object or None.
        """
        if not self.__socket_is_patched():
            gevent.monkey.patch_socket()
        try:
            # Percent-encode the path, query and fragment while leaving scheme and host untouched.
            split_url = list(urlsplit(url))
            split_url[2:] = [quote(elmt, safe='/=+&%') for elmt in split_url[2:]]
            url = urlunsplit(split_url)
            req = self.session.request('GET', url, retries=30)
        except Exception as e:
            logger.exception(e)
            req = None
            # format() instead of '+' so that a non-string url cannot raise inside the handler.
            logger.warning('Unable to download url {0}'.format(url))
        return req

    def get_artist_page(self, artist):
        """
        Fetches the web page for the supplied artist.

        :param artist: string.
            Artist name.
        :return: string or None.
            Artist's raw html page. None if the artist page was not found or could not be downloaded.
        """
        artist = self._clean_string(artist)
        url = self._make_artist_url(artist)
        if not url:
            return None
        response = self.get_page(url)
        if response is None:
            # get_page() already logged the failure; behave like get_lyrics_page() and report "not found".
            return None
        raw_html = response.data
        artist_page = BeautifulSoup(raw_html.decode('utf-8', 'ignore'), 'lxml')
        if not self._has_artist(artist_page):
            return None
        return raw_html

    def get_lyrics_page(self, url):
        """
        Fetches the web page containing the lyrics at the supplied url.

        :param url: string.
            Lyrics url.
        :return: string or None.
            Lyrics's raw html page. None if the lyrics page was not found.
        """
        try:
            raw_html = self.get_page(url).data
        except AttributeError:
            # get_page() returned None (download failed).
            return None
        lyrics_page = BeautifulSoup(raw_html.decode('utf-8', 'ignore'), 'lxml')
        if not self._has_lyrics(lyrics_page):
            return None
        return raw_html

    def get_lyrics(self, artist, album=None, song=None):
        """
        This is the main method of this class.
        Connects to the Lyrics Provider and downloads lyrics for all the albums of the supplied artist and songs.
        Returns a Discography Object or None if the artist was not found on the Lyrics Provider.

        :param artist: string.
            Artist name.
        :param album: string.
            Album title.
        :param song: string.
            Song title.
        :return: models.Discography object or None.
        """
        raw_html = self.get_artist_page(artist)
        if not raw_html:
            logger.warning('{0} was not found on {1}'.format(artist, self.name))
            return None
        albums = self.get_albums(raw_html)
        if album:
            # If user supplied a specific album
            albums = [elmt for elmt in albums if album.lower() in self.get_album_infos(elmt)[0].lower()]
        album_objects = []
        for elmt in albums:
            try:
                album_title, release_date = self.get_album_infos(elmt)
            except ValueError as e:
                # album_title is not bound (or is stale from the previous album) at this point,
                # so describe the failing album through its tag instead.
                logger.warning('Error {0} while downloading {1}'.format(e, getattr(elmt, 'text', elmt)))
                continue
            song_links = self.get_songs(elmt)
            song_links = [link for link in song_links if link]
            if song:
                # If user supplied a specific song
                song_links = [link for link in song_links if song.lower() in link.text.lower()]
            if self.tor_controller and self.tor_controller.controlport:
                # Renew Tor circuit before starting downloads.
                self.tor_controller.renew_tor_circuit()
                self.session = self.tor_controller.get_tor_session()
            if song_links:
                logger.info('Downloading {0}'.format(album_title))
            # Sets the worker pool for async requests. 25 is a nice value to not annoy site owners ;)
            pool = Pool(25)
            results = [pool.spawn(self.create_song, *(link, artist, album_title)) for link in song_links]
            pool.join()  # Gathers results from the pool
            # Distinct loop name: on Python 2 a list comprehension variable leaks and would
            # otherwise overwrite the `song` filter for the following albums.
            songs = [greenlet.value for greenlet in results if greenlet.value]
            if songs:
                album_obj = Album(album_title, artist, songs, release_date)
                album_objects.append(album_obj)
                logger.info('{0} successfully downloaded'.format(album_title))
            else:
                logger.info('Skipped downloading {0} as no lyrics matched.'.format(album_title))
        discography = Discography(artist, album_objects)
        return discography
class LyricWiki(LyricsProvider):
    """
    Class interfacing with http://lyrics.wikia.com .
    This class is used to retrieve lyrics from LyricWiki.

    """
    base_url = 'http://lyrics.wikia.com'
    name = 'LyricWiki'

    def _has_lyrics(self, lyrics_page):
        """
        Checks if the lyrics provider has the lyrics for the song or not.

        :param lyrics_page: BeautifulSoup object.
        :return: bool.
        """
        return not lyrics_page.find("div", {'class': 'noarticletext'})

    # Artist pages are checked with the same 'noarticletext' marker as lyrics pages.
    _has_artist = _has_lyrics

    def _make_artist_url(self, artist):
        """
        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string.
        """
        url = self.base_url + '/wiki/' + artist
        return url

    def get_album_page(self, artist, album):
        """
        Fetches the album page for the supplied artist and album.

        :param artist: string.
            Artist name.
        :param album: string.
            Album title.
        :return: string or None.
            Album's raw html page. None if the album page was not found or could not be downloaded.
        """
        artist = self._clean_string(artist)
        album = self._clean_string(album)
        url = self.base_url + '/wiki/' + artist + ':' + album
        response = self.get_page(url)
        if response is None:
            # get_page() returns None when the download failed.
            return None
        raw_html = response.data
        album_page = BeautifulSoup(raw_html.decode('utf-8', 'ignore'), 'lxml')
        if album_page.find("div", {'class': 'noarticletext'}):
            return None
        return raw_html

    def get_albums(self, raw_artist_page):
        """
        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects.
        """
        artist_page = BeautifulSoup(raw_artist_page.decode('utf-8', 'ignore'), 'lxml')
        # .get() so that a headline span without an id attribute is kept instead of raising KeyError.
        albums = [tag for tag in artist_page.find_all("span", {'class': 'mw-headline'}) if
                  tag.attrs.get('id') not in ('Additional_information', 'External_links')]
        return albums

    def get_album_infos(self, tag):
        """
        Extracts the Album informations from the tag

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date. The release date is 'Unknown' when the headline carries no
            parenthesised part, in which case the whole headline text is the title.
        """
        text = tag.text
        cut = text.find(' (')
        dates = re.findall(r'\(([^()]+)\)', text)
        if cut == -1 or not dates:
            # No release date: keep the full title (slicing with -1 would drop its last character).
            return text, 'Unknown'
        return text[:cut], dates[0]

    def get_songs(self, album):
        """
        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
            Empty list when no ordered list follows the album headline.
        """
        node = album.parent
        # Walk the following siblings until the <ol> holding the track list is reached.
        while node is not None and getattr(node, 'name', None) != 'ol':
            node = node.next_sibling
        if node is None:
            return []
        song_links = [elmt.find('a') for elmt in node.find_all('li')]
        return song_links

    def create_song(self, link, artist, album_title):
        """
        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        href = link.attrs['href']
        song_url = href if href.startswith(self.base_url) else self.base_url + href
        full_title = link.attrs.get('title') or link.text
        # The title attribute looks like "Artist:Song"; keep what follows the first colon when there is one.
        _, colon, remainder = full_title.partition(':')
        song_title = remainder if colon else full_title
        if '(page does not exist' in song_title:
            return None
        raw_lyrics_page = self.get_lyrics_page(song_url)
        if not raw_lyrics_page:
            return None
        lyrics_page = BeautifulSoup(raw_lyrics_page.decode('utf-8', 'ignore'), 'lxml')
        lyrics = self.extract_lyrics(lyrics_page)
        if lyrics is None:
            return None
        writers = self.extract_writers(lyrics_page)
        song = Song(song_title, album_title, artist, lyrics, writers)
        return song

    def extract_lyrics(self, lyrics_page):
        """
        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics. None when the page has no lyric box.
        """
        lyric_box = lyrics_page.find("div", {'class': 'lyricbox'})
        if lyric_box is None:
            return None
        lyrics = '\n'.join(lyric_box.strings)
        return lyrics

    def extract_writers(self, lyrics_page):
        """
        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers.
        """
        writers_box = lyrics_page.find("table", {'class': 'song-credit-box'})
        if not writers_box:
            return None
        paragraphs = writers_box.find_all('p')
        if not paragraphs:
            return None
        return paragraphs[-1].text.strip()

    def _clean_string(self, text):
        """
        Cleans the supplied string and formats it to use in a url.

        :param text: string.
            Text to be cleaned.
        :return: string.
            Cleaned text.
        """
        for elmt in [('#', 'Number_'), ('[', '('), (']', ')'), ('{', '('), ('}', ')'), (' ', '_')]:
            text = text.replace(*elmt)
        return text
class AzLyrics(LyricsProvider):
    # TODO: Check why Azlyrics randomly generates 'ProtocolError('Connection aborted.', BadStatusLine("''",))'
    # (Caused by ProtocolError('Connection aborted.', BadStatusLine("''",)))
    """
    Class interfacing with https://azlyrics.com .
    This class is used to retrieve lyrics from AzLyrics.

    """
    base_url = 'https://www.azlyrics.com'
    search_url = 'https://search.azlyrics.com/search.php?q='
    name = 'AzLyrics'

    def _has_lyrics(self, lyrics_page):
        """
        Checks if the lyrics provider has the lyrics for the song or not.

        :param lyrics_page: BeautifulSoup object.
        :return: bool.
        """
        return bool(lyrics_page.find("div", {'class': 'lyricsh'}))

    def _has_artist(self, page):
        """
        Check if the artist is in the lyrics provider's database.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(page.find("div", {'id': 'listAlbum'}))

    def _first_panel_title(self, page):
        """
        Returns the bold title of the first result panel of a search page.

        :param page: BeautifulSoup object.
        :return: string or None.
            None when the page has no result panel.
        """
        heading = page.find("div", {'class': 'panel-heading'})
        bold = heading.find('b') if heading else None
        return bold.text if bold else None

    def _has_artist_result(self, page):
        """
        Checks if the first panel of the search results page lists artists.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return self._first_panel_title(page) == 'Artist results:'

    def _has_song_result(self, page):
        """
        Checks if the first panel of the search results page lists songs.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return self._first_panel_title(page) == 'Song results:'

    def _make_artist_url(self, artist):
        """
        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string or None.
        """
        return self.search(artist)

    def search(self, artist):
        """
        Searches for the artist in the supplier's database.

        :param artist: Artist's name.
        :return: url or None.
            Url to the artist's page if found. None if not Found.
        """
        artist = artist.replace(' ', '+')
        # Only drop a leading article ("The Beatles"), not any name that merely starts with
        # the letters "the" ("Therapy?").
        if artist.lower().startswith('the+'):
            artist = artist[4:]
        url = self.search_url + artist
        response = self.get_page(url)
        if response is None:
            return None
        results_page = BeautifulSoup(response.data.decode('utf-8', 'ignore'), 'lxml')
        if not self._has_artist_result(results_page):
            return None
        target_node = results_page.find("div", {'class': 'panel-heading'}).find_next_sibling("table")
        first_link = target_node.find('a') if target_node else None
        artist_url = first_link.attrs.get('href') if first_link else None
        if not artist_url:
            return None
        if not artist_url.startswith(self.base_url):
            artist_url = self.base_url + artist_url
        return artist_url

    def get_albums(self, raw_artist_page):
        """
        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects.
        """
        artist_page = BeautifulSoup(raw_artist_page.decode('utf-8', 'ignore'), 'lxml')
        albums = list(artist_page.find_all("div", {'id': 'listAlbum'}))
        return albums

    def get_album_infos(self, tag):
        """
        Extracts the Album informations from the tag

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date. The release date is 'Unknown' when the tag text
            has no parenthesised part.
        """
        album_infos = tag.find("div", {'class': 'album'}).text
        album_title = re.findall(r'"([^"]*)"', album_infos)[0]
        # An empty findall() result used to raise IndexError, which the former
        # `except ValueError` never caught; test for the empty list instead.
        dates = re.findall(r'\(([^()]+)\)', tag.text)
        release_date = dates[0] if dates else 'Unknown'
        return album_title, release_date

    def get_songs(self, album):
        """
        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
        """
        song_links = [anchor for anchor in album.find_all('a') if 'href' in anchor.attrs]
        return song_links

    def create_song(self, link, artist, album_title):
        """
        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        song_title = link.text
        # Song hrefs are relative ("../lyrics/..."); drop the ".." to build an absolute url.
        raw_lyrics_page = self.get_lyrics_page(self.base_url + link.attrs['href'].replace('..', ''))
        if not raw_lyrics_page:
            return None
        lyrics_page = BeautifulSoup(raw_lyrics_page.decode('utf-8', 'ignore'), 'lxml')
        lyrics = self.extract_lyrics(lyrics_page)
        if lyrics is None:
            return None
        writers = self.extract_writers(lyrics_page)
        song = Song(song_title, album_title, artist, lyrics, writers)
        return song

    def extract_lyrics(self, lyrics_page):
        """
        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics. None when no matching div is found.
        """
        # The lyrics are looked up as the first div carrying neither a class nor an id.
        lyric_box = lyrics_page.find("div", {"class": None, "id": None})
        if lyric_box is None:
            return None
        lyrics = ''.join(lyric_box.strings)
        return lyrics

    def extract_writers(self, lyrics_page):
        """
        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers or None.
        """
        writers_box = lyrics_page.find_all("div", {'class': 'smt'})
        if writers_box:
            writers = writers_box[-1].text.strip()
        else:
            writers = None
        return writers

    def _clean_string(self, text):
        """
        Cleans the supplied string and formats it to use in a url.

        :param text: string.
            Text to be cleaned.
        :return: string.
            Cleaned text.
        """
        return text
class Genius(LyricsProvider):
    """
    Class interfacing with https://genius.com .
    This class is used to retrieve lyrics from Genius.

    """
    base_url = 'https://genius.com'
    search_url = base_url + '/search?q='
    name = 'Genius'

    def _has_lyrics(self, page):
        """
        Checks if the lyrics provider has the lyrics for the song or not.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(page.find("div", {'class': 'song_body-lyrics'}))

    def _has_artist(self, page):
        """
        Check if the artist is in the lyrics provider's database.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return not page.find("div", {'class': 'render_404'})

    def _make_artist_url(self, artist):
        """
        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string.
        """
        url = self.base_url + '/artists/' + artist
        return url

    def _get_soup(self, url):
        """
        Downloads the supplied url and parses it.

        :param url: string.
        :return: BeautifulSoup object or None.
            None when the page could not be downloaded.
        """
        response = self.get_page(url)
        if response is None:
            return None
        return BeautifulSoup(response.data.decode('utf-8', 'ignore'), 'lxml')

    def get_albums(self, raw_artist_page):
        """
        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects. Empty when the albums page cannot be located or downloaded.
        """
        artist_page = BeautifulSoup(raw_artist_page.decode('utf-8', 'ignore'), 'lxml')
        albums_link = artist_page.find("a", {'class': 'full_width_button'})
        if albums_link is None:
            return []
        # The button targets the artist's songs listing; the albums listing uses the same url scheme.
        albums_link = albums_link.attrs['href'].replace('songs?', 'albums?')
        albums_page = self._get_soup(self.base_url + albums_link)
        if albums_page is None:
            return []
        albums = list(albums_page.find_all("a", {'class': 'album_link'}))
        return albums

    def get_album_infos(self, tag):
        """
        Extracts the Album informations from the tag

        Downloads the album page to read its release date.

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date. The release date is 'Unknown' when it cannot be read.
        """
        album_title = tag.text
        album_page = self._get_soup(self.base_url + tag.attrs['href'])
        info_box = None
        if album_page is not None:
            info_box = album_page.find("div", {'class': 'header_with_cover_art-primary_info'})
        if info_box is None:
            return album_title, 'Unknown'
        metadata = [elmt for elmt in info_box.find_all("div", {'class': 'metadata_unit'}) if
                    elmt.text.startswith('Released')]
        release_date = metadata[0].text if metadata else 'Unknown'
        return album_title, release_date

    def get_songs(self, album):
        """
        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
            Empty when the album page could not be downloaded.
        """
        album_page = self._get_soup(self.base_url + album.attrs['href'])
        if album_page is None:
            return []
        rows = album_page.find_all("div", {
            'class': 'chart_row chart_row--light_border chart_row--full_bleed_left chart_row--align_baseline chart_row--no_hover'})
        song_links = [row.find('a') for row in rows]
        return song_links

    def create_song(self, link, artist, album_title):
        """
        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        href = link.attrs['href']
        song_url = href if href.startswith(self.base_url) else self.base_url + href
        # Only the first line of the link text is kept as the title.
        song_title = link.text.strip('\n').split('\n')[0].lstrip()
        raw_lyrics_page = self.get_lyrics_page(song_url)
        if not raw_lyrics_page:
            return None
        lyrics_page = BeautifulSoup(raw_lyrics_page.decode('utf-8', 'ignore'), 'lxml')
        lyrics = self.extract_lyrics(lyrics_page)
        if lyrics is None:
            return None
        writers = self.extract_writers(lyrics_page)
        song = Song(song_title, album_title, artist, lyrics, writers)
        return song

    def extract_lyrics(self, lyrics_page):
        """
        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics. None when the page has no lyrics div.
        """
        lyric_box = lyrics_page.find("div", {"class": 'lyrics'})
        if lyric_box is None:
            return None
        lyrics = ''.join(lyric_box.strings)
        return lyrics

    def extract_writers(self, lyrics_page):
        """
        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers or None.
        """
        writers_box = [elmt for elmt in lyrics_page.find_all("span", {'class': 'metadata_unit-label'}) if
                       elmt.text == "Written By"]
        if not writers_box:
            return None
        target_node = writers_box[0].find_next_sibling("span", {'class': 'metadata_unit-info'})
        if target_node is None:
            return None
        return target_node.text.strip()

    def _clean_string(self, text):
        """
        Cleans the supplied string and formats it to use in a url.

        :param text: string.
            Text to be cleaned.
        :return: string.
            Cleaned text.
        """
        text = normalize(text).lower().capitalize()
        return text
class Lyrics007(LyricsProvider):
    """
    Class interfacing with https://www.lyrics007.com .
    This class is used to retrieve lyrics from Lyrics007.

    """
    base_url = 'https://www.lyrics007.com'
    search_url = base_url + '/search.php?category=artist&q='
    name = 'Lyrics007'

    def _has_lyrics(self, page):
        """
        Checks if the lyrics provider has the lyrics for the song or not.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(page.find("div", {'class': 'lyrics'}))

    def _has_artist(self, page):
        """
        Check if the artist is in the lyrics provider's database.

        :param page: BeautifulSoup object
        :return: bool.
        """
        return bool(page.find("ul", {'class': 'song_title'}))

    def _first_result_link(self, page):
        """
        Returns the first link of the search results section.

        :param page: BeautifulSoup object.
        :return: BeautifulSoup Link object or None.
            None when the page has no results section or the section has no link.
        """
        results = page.find("div", {'id': 'search_result'})
        return results.find('a') if results else None

    def _has_artist_result(self, page):
        """
        Check if the search results page lists at least one artist.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(self._first_result_link(page))

    def _make_artist_url(self, artist):
        """
        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string or None.
        """
        return self.search(artist)

    def search(self, artist):
        """
        Searches for the artist in the supplier's database.

        :param artist: string.
            Artist's name.
        :return: string or None.
            Artist's url page.
        """
        # Everything but letters, digits and dots becomes a '+' in the query string.
        artist = "".join([c if (c.isalnum() or c == '.') else "+" for c in artist])
        url = self.search_url + artist
        response = self.get_page(url)
        if response is None:
            return None
        results_page = BeautifulSoup(response.data.decode('utf-8', 'ignore'), 'lxml')
        first_link = self._first_result_link(results_page)
        if not first_link:
            return None
        artist_url = first_link.attrs.get('href')
        if not artist_url:
            return None
        if not artist_url.startswith(self.base_url):
            artist_url = self.base_url + artist_url
        return artist_url

    def get_albums(self, raw_artist_page):
        """
        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects. Empty when the page has no content div.
        """
        artist_page = BeautifulSoup(raw_artist_page.decode('utf-8', 'ignore'), 'lxml')
        content = artist_page.find("div", {'class': 'content'})
        if content is None:
            return []
        albums = list(content.find_all('li', recursive=False))
        return albums

    def get_album_infos(self, tag):
        """
        Extracts the Album informations from the tag

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date.
        """
        # Split on the first ': ' only, so that a title which itself contains ': ' keeps
        # its release date and its full name.
        infos = tag.text.split(': ', 1)
        if len(infos) == 2:
            release_date, album_title = infos
        else:
            release_date = 'Unknown'
            album_title = infos[0]
        return album_title, release_date

    def get_songs(self, album):
        """
        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
            Empty when no list follows the album tag.
        """
        target_node = album.find_next_sibling("ul")
        if target_node is None:
            return []
        anchors = (elmt.find('a') for elmt in target_node.find_all('li'))
        song_links = [anchor for anchor in anchors if anchor]
        return song_links

    def create_song(self, link, artist, album_title):
        """
        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        href = link.attrs['href']
        song_url = href if href.startswith(self.base_url) else self.base_url + href
        song_title = link.text
        raw_lyrics_page = self.get_lyrics_page(song_url)
        if not raw_lyrics_page:
            return None
        lyrics_page = BeautifulSoup(raw_lyrics_page.decode('utf-8', 'ignore'), 'lxml')
        lyrics = self.extract_lyrics(lyrics_page)
        if lyrics is None:
            return None
        writers = self.extract_writers(lyrics_page)
        song = Song(song_title, album_title, artist, lyrics, writers)
        return song

    def extract_lyrics(self, lyrics_page):
        """
        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics. None when the page has no lyrics div.
        """
        lyric_box = lyrics_page.find("div", {'class': 'lyrics'})
        if lyric_box is None:
            return None
        lyrics = '\n'.join(lyric_box.strings)
        return lyrics

    def extract_writers(self, lyrics_page):
        """
        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers or None.
        """
        writers_box = [elmt for elmt in lyrics_page.strings if
                       elmt.lower().startswith(('writers:', 'writer:'))]
        if writers_box:
            writers = writers_box[0].strip()
        else:
            writers = None
        return writers

    def _clean_string(self, text):
        """
        Cleans the supplied string and formats it to use in a url.

        :param text: string.
            Text to be cleaned.
        :return: string.
            Cleaned text.
        """
        return text
class MusixMatch(LyricsProvider):
    """
    Class interfacing with https://www.musixmatch.com .
    This class is used to retrieve lyrics from MusixMatch.

    """
    base_url = 'https://www.musixmatch.com'
    search_url = base_url + '/search/{0}/artists'
    name = 'MusixMatch'

    def _has_lyrics(self, page):
        """
        Checks if the lyrics provider has the lyrics for the song or not.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(page.find("div", {'class': 'mxm-lyrics'}))

    def _has_artist(self, page):
        """
        Check if the artist is in the lyrics provider's database.

        :param page: BeautifulSoup object.
        :return: bool.
        """
        return bool(page.find("div", {'class': 'artist-page main-wrapper'}))

    def _make_artist_url(self, artist):
        """
        Builds an url for the artist page of the lyrics provider.

        :param artist: string.
        :return: string.
        """
        return self.base_url + '/artist/' + artist

    def _get_soup(self, url):
        """
        Downloads the supplied url and parses it.

        :param url: string.
        :return: BeautifulSoup object or None.
            None when the page could not be downloaded.
        """
        response = self.get_page(url)
        if response is None:
            return None
        return BeautifulSoup(response.data.decode('utf-8', 'ignore'), 'lxml')

    def get_albums(self, raw_artist_page):
        """
        Fetches the albums section in the supplied html page.

        :param raw_artist_page: Artist's raw html page.
        :return: list.
            List of BeautifulSoup objects. Empty when the albums page cannot be located or downloaded.
        """
        artist_page = BeautifulSoup(raw_artist_page.decode('utf-8', 'ignore'), 'lxml')
        albums_item = artist_page.find("li", {'id': 'albums'})
        albums_anchor = albums_item.find('a') if albums_item else None
        if albums_anchor is None:
            return []
        albums_page = self._get_soup(self.base_url + albums_anchor.attrs['href'])
        if albums_page is None:
            return []
        albums = list(albums_page.find_all("div", {'class': 'media-card-text'}))
        return albums

    def get_album_infos(self, tag):
        """
        Extracts the Album informations from the tag

        :param tag: BeautifulSoup object.
        :return: tuple(string, string).
            Album title and release date. The release date is 'Unknown' when the tag has no h3.
        """
        album_title = tag.find('h2').text
        date_node = tag.find('h3')
        release_date = date_node.text if date_node is not None else 'Unknown'
        return album_title, release_date

    def get_songs(self, album):
        """
        Fetches the links to the songs of the supplied album.

        :param album: BeautifulSoup object.
        :return: List of BeautifulSoup Link objects.
            Empty when the album page or its track list is unavailable.
        """
        album_page = self._get_soup(self.base_url + album.find('a').attrs['href'])
        if album_page is None:
            return []
        album_div = album_page.find("div", {'class': 'mxm-album__tracks mxm-collection-container'})
        if album_div is None:
            return []
        items = album_div.find_all("li", {'class': re.compile("^mui-collection__item")})
        song_links = [item.find('a') for item in items]
        return song_links

    def create_song(self, link, artist, album_title):
        """
        Creates a Song object.

        :param link: BeautifulSoup Link object.
        :param artist: string.
        :param album_title: string.
        :return: models.Song object or None.
        """
        href = link.attrs['href']
        song_url = href if href.startswith(self.base_url) else self.base_url + href
        song_title = link.text
        raw_lyrics_page = self.get_lyrics_page(song_url)
        if not raw_lyrics_page:
            return None
        lyrics_page = BeautifulSoup(raw_lyrics_page.decode('utf-8', 'ignore'), 'lxml')
        lyrics = self.extract_lyrics(lyrics_page)
        if not lyrics:
            return None
        writers = self.extract_writers(lyrics_page)
        song = Song(song_title, album_title, artist, lyrics, writers)
        return song

    def extract_lyrics(self, lyrics_page):
        """
        Extracts the lyrics from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Formatted lyrics. None when the page has no lyrics paragraph.
        """
        lyric_box = lyrics_page.find_all("p", {'class': re.compile("^mxm-lyrics__content")})
        if not lyric_box:
            return None
        # .text rather than .string: .string is None as soon as a paragraph has more than one
        # child node, which made the join raise TypeError.
        lyrics = '\n'.join(elmt.text for elmt in lyric_box)
        return lyrics

    def extract_writers(self, lyrics_page):
        """
        Extracts the writers from the lyrics page of the supplied song.

        :param lyrics_page: BeautifulSoup Object.
            BeautifulSoup lyrics page.
        :return: string or None.
            Song writers or None.
        """
        writers_box = lyrics_page.find("p", {'class': re.compile("^mxm-lyrics__copyright")})
        if writers_box:
            writers = writers_box.text.strip()
        else:
            writers = None
        return writers

    def _clean_string(self, text):
        """
        Cleans the supplied string and formats it to use in a url.

        :param text: string.
            Text to be cleaned.
        :return: string.
            Cleaned text.
        """
        text = text.replace(' ', '-').replace('.', '-')
        # endswith() is safe on an empty string, unlike text[-1]. Only one trailing dash is removed.
        if text.endswith('-'):
            text = text[:-1]
        return text
# Running this module as a script currently does nothing.
if __name__ == "__main__": pass