import re from urllib import urlencode, urlopen __doc__ = \ """This module allows Python programs to search's website and retrieve a list of AlbumEntry objects. From there, you can choose the ones that you like and retrieve their xmcd information (an xmcdAlbum object) with the load() method. The main function is freedb_search(), which accepts nearly the same parameters as the website. As it often returns many irrelevant searches and I wish to do artist+album searches, I wrote soundex_freedb_search() which will filter the output for items having the same soundex for artist and album title. It is by no means foolproof, as the database is quite noisy and soundex matching is fairly coarse. Please let me know if you write any software that uses this. Soundex module used to be available from It seems to have disappeared, so I've put up a copy at """ __author__ = 'David McClosky (dmcc AT' __version__ = 1.00 """A list of genres which freedb recognizes. The functions that deal with them will also accept the pseudo-category 'all'.""" genres = ('blues', 'classical', 'country', 'data', 'folk', 'jazz', 'misc', 'newage', 'reggae', 'rock', 'soundtrack') """A list of criteria which you can search on. title means the title of the album, track means the track titles. The pseudo-category 'all' is also accepted.""" criterion = ('artist', 'title', 'track', 'rest') _album_entry_re = re.compile(r'([^<]+)
', re.M | re.S | re.I) _album_alt_entry_re = re.compile(r'', re.M | re.S | re.I) _disc_len_re = re.compile(r'# Disc length: (\d+) sec') _xmcd_line_re = re.compile(r'(.*?)=(.*)') def listify(x): """If x is a list, do nothing. Otherwise, make a new tuple of x.""" try: # we'll try to iterate over it and see what happens for item in x: break # we can iterate over it, now break out of the loop return x except TypeError: return (x,) def pretty_time_range(diff): """Show a time difference or length as a pretty string: '1h23m45s' for example. diff is in seconds.""" diff = int(diff) hours, diff = divmod(diff, 3600) minutes, seconds = divmod(diff, 60) str = '' if hours: str = '%sh' % hours if minutes: str += '%sm' % minutes if seconds: str += '%ss' % seconds if not str: str = '%ss' % seconds return str class AlbumEntry: """Represents an entry for an album on the freedb search page. Some album entries include multiple versions, use versions() to find out how many exist.""" def __init__(self, category, checksum, artisttitle, alternates): """category and checksum specify which album this AlbumEntry represents. artisttitle is the string that shows the artist and album title. We will try to split it up by ' / ', but will throw everything into artist if we fail. Alternates is the HTML for links to alternate albums.""" self.category = category self.checksum = checksum try: (self.artist, self.title) = artisttitle.split(' / ') except: self.artist = artisttitle self.title = '' self.alternates = _album_alt_entry_re.findall(alternates) self.num_versions = len(self.alternates) + 1 def __str__(self): """Returns a description containing all information about the album entry. The text in parentheses is the checksum and the text in brackets is the number of versions.""" return "%s: %s (%s) [%d]" % (self.artist, self.title, self.checksum, self.versions()) def url(self, version=0): """Given a version number, returns the URL for the xmcd page for the album.""" if version == 0: category, checksum = self.category, self.checksum else: category, checksum = self.alternates[version-1] # xmcd page url = '' % (category, checksum) return url def load(self, version=0): """Given a version number, returns an xmcdAlbum object for the version. Loads the main version by default.""" url = urlopen(self.url(version)) html = return xmcdAlbum(html) def versions(self): 'Returns the number of versions for this AlbumEntry' return self.num_versions class xmcdTrack: def __init__(self, title, extended, length, number): self.title = title self.extended = extended self.length = length self.number = number def __str__(self): s = '%2d: %s' % (self.number, self.title) if self.extended: s += ' (%s)' % self.extended s += ' [%s]\n' % pretty_time_range(self.length) return s class xmcdAlbum: """Parses an XMCD text as a string or a file. Includes album information as well as track information.""" def __init__(self, text='', filename=None): """If given a filename, it will load that file. Otherwise, it will read the text in the text parameter.""" if filename: file = open(filename) text = lines = text.splitlines() self.frame_offsets = [] data = {} # fo stands for frame offset. While parsing the xmcd file, here's what # the fo_modes mean: # 0: not seen yet, 1: means we're seeing it right now, 2: we're done # with it fo_mode = 0 for l in lines: if fo_mode < 2 and l[0] == '#': if fo_mode == 1: try: num = int(l[1:].strip()) self.frame_offsets.append(num) except ValueError: fo_mode = 2 elif l.find('Track frame offsets:') != -1: fo_mode = 1 elif l[0] == '#': match = if match: self.length = int( else: match = if match: k, v =, 2) data[k] = data.get(k, '') + v else: raise "xmcdParseError", l # replace all \n with real newlines for k, v in data.items(): data[k] = v.replace('\\n', '\n') # add end of CD as last frame offset self.frame_offsets.append(self.length * 75) self.extended = data['EXTD'] count = 0 self.tracks, self.lengths = [], [] while 1: try: length = (self.frame_offsets[count+1] - self.frame_offsets[count]) / 75.0 self.lengths.append(length) self.tracks.append(xmcdTrack(data['TTITLE%d' % count], data['EXTT%d' % count], length, count+1)) count += 1 except IndexError: # when we get to the end, we will fall off the # end break for a in ('discid', 'dtitle', 'dyear', 'dgenre', 'playorder'): self.__dict__[a] = data['%s' % a.upper()] try: self.dyear = int(self.dyear) except ValueError: self.dyear = 0 try: self.playorder = [int(n) for n in self.playorder.split(',')] except ValueError: # if int() fails pass try: (self.artist, self.title) = self.dtitle.split(' / ') except: (self.artist, self.title) = self.dtitle, '' def get_tracks(self): 'Returns a list of xmcdTrack objects for this album' return self.tracks def __getitem__(self, track): 'Returns an xmcdTrack object of the requested track' return self.tracks[track] def __len__(self): 'Returns the number of tracks' return len(self.tracks) def get_disc_length(self): '''Returns disc length in seconds''' return self.length def __str__(self): '''A pretty representation of this object''' if self.dyear: year = " (%d)" % self.dyear else: year = '' s = "%s%s [%s]\n" % (self.dtitle, year, pretty_time_range(self.length)) if self.extended: s += self.extended + "\n" for t in self.tracks: s += str(t) return s def freedb_search(text, criteria='artist', categories='all'): """Performs a freedb search via the freedb website. Criteria is a list of items from the criteria list""" cgi_opts = [('words', text)] if criteria is 'all': cgi_opts.append(('allfields', 'YES')) else: cgi_opts.append(('allfields', 'NO')) criteria = listify(criteria) for c in criteria: if c in criterion: cgi_opts.append(('fields', c)) else: raise "IllegalCriteria", c if categories is 'all': cgi_opts.append(('allcats', 'YES')) else: cgi_opts.append(('allcats', 'NO')) categories = listify(categories) for c in categories: if c in genres: cgi_opts.append(('cats', c)) else: raise "IllegalCategory", c cgi_opts.extend([('grouping', 'none'), ('x', 0), ('y', 0)]) url = urlopen('' + urlencode(cgi_opts)) html = albums = [AlbumEntry(*album_entry) for album_entry in _album_entry_re.findall(html)] return albums def soundex_freedb_search(artist, album, artist_filter=1, album_filter=1, categories='all'): """Soundex frontend to freedb_search: Will use filter the output, depending on which filters are set. Returns two lists: The first one contains albums that matched and the second one contains ones that didn't match.""" import soundex artist_soundex = soundex.soundex(artist) album_soundex = soundex.soundex(album) albums = freedb_search('%s %s' % (artist, album), categories=categories) good, bad = [], [] for a in albums: current_artist = soundex.soundex(a.artist) current_album = soundex.soundex(a.title) match = 1 artist_match = current_artist == artist_soundex album_match = current_album == album_soundex if artist_filter: match = match and artist_match if album_filter: match = match and album_match if match: good.append(a) else: bad.append(a) return good, bad def test_freedb_search(): good, bad = soundex_freedb_search('thelonious monk', "straight no chaser") print "Matches:" for a in good: print a print "Non-matches:" for a in bad: print a if __name__ == '__main__': import sys albums = freedb_search(' '.join(sys.argv[1:])) for a in albums: print a for a in albums: print a.load()