import re
from urllib import urlencode, urlopen
# The module documentation is bound to __doc__ explicitly: the string follows
# the import statements, so it would not be picked up as an implicit module
# docstring.
__doc__ = \
"""This module allows Python programs to search freedb.org's website and
retrieve a list of AlbumEntry objects. From there, you can choose the
ones that you like and retrieve their xmcd information (an xmcdAlbum
object) with the load() method.
The main function is freedb_search(), which accepts nearly the same
parameters as the website. As it often returns many irrelevant searches
and I wish to do artist+album searches, I wrote soundex_freedb_search()
which will filter the output for items having the same soundex for artist
and album title. It is by no means foolproof, as the database is quite
noisy and soundex matching is fairly coarse.
Please let me know if you write any software that uses this.
Soundex module used to be available from http://heagy.com/linux/soundex/
It seems to have disappeared, so I've put up a copy at
http://bebop.bigasterisk.com/python/other/soundex.tar.gz
"""
__author__ = 'David McClosky (dmcc AT bigasterisk.com)'
# NOTE: the version is a float (1.0), not a string.
__version__ = 1.00
# The bare string below is an expression statement used as documentation for
# the name that follows it; it has no runtime effect.
"""A list of genres which freedb recognizes. The functions that deal with them
will also accept the pseudo-category 'all'."""
# Valid values for the ``categories`` argument of freedb_search(); each
# selected genre is sent to the site as a 'cats' CGI field.
genres = ('blues', 'classical', 'country', 'data', 'folk', 'jazz', 'misc',
'newage', 'reggae', 'rock', 'soundtrack')
# Same convention as above: a bare documentation string, no runtime effect.
"""A list of criteria which you can search on. title means the title
of the album, track means the track titles. The pseudo-category 'all'
is also accepted."""
# Valid values for the ``criteria`` argument of freedb_search(); each selected
# criterion is sent to the site as a 'fields' CGI field.
criterion = ('artist', 'title', 'track', 'rest')
# Pattern applied with findall() to the search-result HTML in freedb_search();
# every match is unpacked into the four positional arguments of AlbumEntry
# (category, checksum, artisttitle, alternates).
# NOTE(review): this pattern and the next one look damaged -- presumably the
# HTML tag text they contained was stripped at some point; TODO restore them
# from a known-good copy. As written, this literal spans several physical
# lines inside single quotes (which Python does not accept) and has only two
# groups instead of the four that AlbumEntry needs.
_album_entry_re = re.compile(r'([^<]+)
(.*?)
', re.M | re.S | re.I)
# Pattern applied with findall() to the "alternates" HTML in
# AlbumEntry.__init__(); AlbumEntry.url() unpacks each match into
# (category, checksum), so two groups are expected here.
# NOTE(review): the pattern is empty, so the matches cannot be unpacked that
# way -- see the note above.
_album_alt_entry_re = re.compile(r'', re.M | re.S | re.I)
# Captures the number of seconds from the '# Disc length:' comment line of an
# xmcd file.
_disc_len_re = re.compile(r'# Disc length: (\d+) sec')
# Splits an xmcd 'KEY=value' line at the first '=' (the key group is
# non-greedy): group 1 is the key, group 2 the value.
_xmcd_line_re = re.compile(r'(.*?)=(.*)')
def listify(x):
    """Return x unchanged if it is a collection, otherwise wrap it in a tuple.

    Strings are treated as single values, not as sequences of characters, so
    listify('artist') gives ('artist',). Any other iterable (list, tuple,
    generator, ...) is returned as-is; non-iterable objects are wrapped in a
    one-element tuple.
    """
    try:
        text_types = (basestring,)  # Python 2: covers str and unicode
    except NameError:  # Python 3 has no basestring
        text_types = (str, bytes)
    if isinstance(x, text_types):
        return (x,)
    try:
        # iter() only checks iterability; unlike a probing for-loop it does
        # not consume the first item of a generator.
        iter(x)
    except TypeError:
        return (x,)
    return x
def pretty_time_range(diff):
    """Show a time difference or length as a pretty string.

    diff is in seconds and is truncated to a whole number. Components that
    are zero are left out ('1h5s'), except that a zero difference is shown
    as '0s'. A negative difference is formatted from its absolute value and
    prefixed with '-' ('-1h1m1s').
    """
    total = int(diff)
    sign = ''
    if total < 0:
        # divmod() on a negative number would give a negative hour count
        # with positive minutes/seconds, e.g. -5 -> '-1h59m55s'.
        sign, total = '-', -total
    hours, remainder = divmod(total, 3600)
    minutes, seconds = divmod(remainder, 60)
    parts = []
    if hours:
        parts.append('%sh' % hours)
    if minutes:
        parts.append('%sm' % minutes)
    if seconds or not parts:
        # Always show something, even for a zero-length range.
        parts.append('%ss' % seconds)
    return sign + ''.join(parts)
class AlbumEntry:
    """Represents an entry for an album on the freedb search page.

    Some album entries include multiple versions, use versions() to find
    out how many exist.

    Attributes:
        category, checksum -- identify the main version of the album
        artist, title -- taken from the 'artist / title' string
        alternates -- list of matches of _album_alt_entry_re in the
            alternates HTML; url() unpacks each one as (category, checksum)
        num_versions -- len(alternates) + 1 (the main version counts)
    """

    def __init__(self, category, checksum, artisttitle, alternates):
        """category and checksum specify which album this AlbumEntry
        represents. artisttitle is the string that shows the artist
        and album title. We will try to split it up by ' / ', but will
        throw everything into artist if it does not split into exactly
        two parts. alternates is the HTML for links to alternate albums."""
        self.category = category
        self.checksum = checksum
        try:
            (self.artist, self.title) = artisttitle.split(' / ')
        except ValueError:
            # Zero or several ' / ' separators: no unambiguous split.
            self.artist = artisttitle
            self.title = ''
        self.alternates = _album_alt_entry_re.findall(alternates)
        self.num_versions = len(self.alternates) + 1

    def __str__(self):
        """Returns a description containing all information about the
        album entry. The text in parentheses is the checksum and the
        text in brackets is the number of versions."""
        return "%s: %s (%s) [%d]" % (self.artist, self.title, self.checksum,
                                     self.versions())

    def url(self, version=0):
        """Given a version number, returns the URL for the xmcd page for
        the album. Version 0 is the main entry, versions 1..n are the
        alternates. Raises IndexError for any other version number."""
        if not 0 <= version < self.num_versions:
            # A negative number would otherwise silently wrap around and
            # pick an alternate from the end of the list.
            raise IndexError('no such version: %r' % (version,))
        if version == 0:
            category, checksum = self.category, self.checksum
        else:
            category, checksum = self.alternates[version - 1]
        # xmcd page
        return 'http://www.freedb.org/freedb/%s/%s' % (category, checksum)

    def load(self, version=0):
        """Given a version number, returns an xmcdAlbum object for
        the version. Loads the main version by default. This fetches
        the xmcd page over the network."""
        response = urlopen(self.url(version))
        try:
            xmcd_text = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()
        return xmcdAlbum(xmcd_text)

    def versions(self):
        """Returns the number of versions for this AlbumEntry."""
        return self.num_versions
class xmcdTrack:
    """A single track of an xmcd album.

    Holds the track title, its extended information, its length (handed
    to pretty_time_range() for display) and its track number.
    """

    def __init__(self, title, extended, length, number):
        self.number = number
        self.length = length
        self.extended = extended
        self.title = title

    def __str__(self):
        """Return 'NN: title (extended) [length]' followed by a newline;
        the parenthesised part is left out when there is no extended
        information."""
        pieces = ['%2d: %s' % (self.number, self.title)]
        if self.extended:
            pieces.append(' (%s)' % self.extended)
        pieces.append(' [%s]\n' % pretty_time_range(self.length))
        return ''.join(pieces)
class xmcdParseError(ValueError):
    """Raised when xmcd text cannot be parsed.

    The exception argument is the offending line, or a short description
    of what is missing.
    """


class xmcdAlbum:
    """Parses an XMCD text as a string or a file. Includes album information
    as well as track information.

    Attributes set by the constructor:
        frame_offsets -- track start offsets in frames, with the end of
            the disc (length * 75) appended as the last entry
        length -- disc length in seconds, from the '# Disc length:' comment
        lengths -- per-track lengths in seconds (floats)
        tracks -- list of xmcdTrack objects
        extended -- the EXTD value ('' if absent)
        discid, dtitle, dgenre -- the raw DISCID, DTITLE and DGENRE values
        dyear -- DYEAR as an int, 0 if absent or not a number
        playorder -- PLAYORDER as a list of ints, or the raw string when
            it is empty or cannot be parsed
        artist, title -- dtitle split at ' / '; if it does not split into
            exactly two parts, artist is the whole dtitle and title is ''
    """

    def __init__(self, text='', filename=None):
        """If given a filename, it will load that file. Otherwise,
        it will read the text in the text parameter.

        Raises xmcdParseError for a non-comment line that is not of the
        form KEY=value, or when the disc length comment is missing.
        Raises KeyError when DISCID, DTITLE or a TTITLEn entry is missing.
        """
        if filename:
            xmcd_file = open(filename)
            try:
                text = xmcd_file.read()
            finally:
                xmcd_file.close()

        self.frame_offsets = []
        disc_length = None
        data = {}
        # fo stands for frame offset. While parsing the xmcd file, here's
        # what the fo_modes mean:
        # 0: not seen yet, 1: we're seeing it right now, 2: we're done
        # with it
        fo_mode = 0
        for line in text.splitlines():
            if not line.strip():
                continue  # tolerate blank lines
            if line[0] == '#':
                if fo_mode == 1:
                    try:
                        self.frame_offsets.append(int(line[1:].strip()))
                        continue
                    except ValueError:
                        # First comment that is not a number ends the
                        # offset list. Fall through: that same line may
                        # already be the disc length comment.
                        fo_mode = 2
                elif fo_mode == 0 and 'Track frame offsets:' in line:
                    fo_mode = 1
                    continue
                match = _disc_len_re.search(line)
                if match:
                    disc_length = int(match.group(1))
                continue
            match = _xmcd_line_re.search(line)
            if not match:
                raise xmcdParseError(line)
            key, value = match.group(1, 2)
            # A key may be repeated; its values are concatenated.
            data[key] = data.get(key, '') + value
        if disc_length is None:
            raise xmcdParseError("missing '# Disc length:' comment")
        self.length = disc_length

        # replace all \n with real newlines
        for key in list(data):
            data[key] = data[key].replace('\\n', '\n')

        # add end of CD as last frame offset (75 frames per second)
        self.frame_offsets.append(self.length * 75)

        self.extended = data.get('EXTD', '')
        self.tracks, self.lengths = [], []
        # Each track runs from its own offset to the next one.
        for index in range(len(self.frame_offsets) - 1):
            length = (self.frame_offsets[index + 1] -
                      self.frame_offsets[index]) / 75.0
            self.lengths.append(length)
            self.tracks.append(xmcdTrack(data['TTITLE%d' % index],
                                         data.get('EXTT%d' % index, ''),
                                         length, index + 1))

        self.discid = data['DISCID']
        self.dtitle = data['DTITLE']
        # NOTE(review): older xmcd files may lack DYEAR/DGENRE; treating
        # these keywords (and PLAYORDER) as optional -- confirm against
        # the format specification.
        self.dgenre = data.get('DGENRE', '')
        self.playorder = data.get('PLAYORDER', '')
        try:
            self.dyear = int(data.get('DYEAR', ''))
        except ValueError:
            self.dyear = 0
        try:
            self.playorder = [int(n) for n in self.playorder.split(',')]
        except ValueError:  # if int() fails, keep the raw string
            pass
        try:
            (self.artist, self.title) = self.dtitle.split(' / ')
        except ValueError:
            (self.artist, self.title) = self.dtitle, ''

    def get_tracks(self):
        """Returns a list of xmcdTrack objects for this album."""
        return self.tracks

    def __getitem__(self, track):
        """Returns an xmcdTrack object of the requested track (0-based
        index into the track list)."""
        return self.tracks[track]

    def __len__(self):
        """Returns the number of tracks."""
        return len(self.tracks)

    def get_disc_length(self):
        """Returns disc length in seconds."""
        return self.length

    def __str__(self):
        """A pretty representation of this object: a title line with the
        year (if known) and total length, the extended information (if
        any), then one line per track."""
        if self.dyear:
            year = " (%d)" % self.dyear
        else:
            year = ''
        parts = ["%s%s [%s]\n" % (self.dtitle, year,
                                  pretty_time_range(self.length))]
        if self.extended:
            parts.append(self.extended + "\n")
        parts.extend([str(t) for t in self.tracks])
        return ''.join(parts)
class IllegalCriteria(ValueError):
    """Raised by freedb_search() for a search field not in criterion."""


class IllegalCategory(ValueError):
    """Raised by freedb_search() for a category not in genres."""


def _checked_terms(values, allowed, error):
    """Return values as a sequence, raising error(term) for the first term
    that is not in allowed."""
    try:
        text_types = (basestring,)  # Python 2: covers str and unicode
    except NameError:  # Python 3 has no basestring
        text_types = (str, bytes)
    if isinstance(values, text_types):
        # A single name such as 'artist' is one term, not a sequence of
        # characters.
        values = (values,)
    else:
        values = listify(values)
    for term in values:
        if term not in allowed:
            raise error(term)
    return values


def freedb_search(text, criteria='artist', categories='all'):
    """Performs a freedb search via the freedb website.

    text is the search string. criteria is 'all', a single item from the
    criterion tuple, or a list of such items. categories is 'all', a
    single item from the genres tuple, or a list of such items.

    Returns a list of AlbumEntry objects, one per match of
    _album_entry_re in the result page.

    Raises IllegalCriteria or IllegalCategory (both ValueError
    subclasses) for an unknown criterion or category; the check happens
    before any network access.
    """
    cgi_opts = [('words', text)]
    if criteria == 'all':
        cgi_opts.append(('allfields', 'YES'))
    else:
        cgi_opts.append(('allfields', 'NO'))
        cgi_opts.extend([('fields', c) for c in
                         _checked_terms(criteria, criterion,
                                        IllegalCriteria)])
    if categories == 'all':
        cgi_opts.append(('allcats', 'YES'))
    else:
        cgi_opts.append(('allcats', 'NO'))
        cgi_opts.extend([('cats', c) for c in
                         _checked_terms(categories, genres,
                                        IllegalCategory)])
    # NOTE(review): x and y presumably mimic the click coordinates sent by
    # the web form's submit button -- confirm.
    cgi_opts.extend([('grouping', 'none'), ('x', 0), ('y', 0)])
    response = urlopen('http://www.freedb.org/freedb_search.php?' +
                       urlencode(cgi_opts))
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    return [AlbumEntry(*album_entry)
            for album_entry in _album_entry_re.findall(html)]
def soundex_freedb_search(artist, album, artist_filter=1, album_filter=1,
                          categories='all'):
    """Soundex frontend to freedb_search().

    Searches for '<artist> <album>' and sorts the results into two lists
    by comparing soundex codes: with artist_filter set the artist must
    match, with album_filter set the album title must match. Returns
    (matching, non_matching).
    """
    import soundex

    wanted_artist = soundex.soundex(artist)
    wanted_album = soundex.soundex(album)
    query = '%s %s' % (artist, album)
    good, bad = [], []
    for entry in freedb_search(query, categories=categories):
        artist_ok = soundex.soundex(entry.artist) == wanted_artist
        album_ok = soundex.soundex(entry.title) == wanted_album
        # An entry is rejected only by a filter that is switched on.
        rejected = ((artist_filter and not artist_ok) or
                    (album_filter and not album_ok))
        if rejected:
            bad.append(entry)
        else:
            good.append(entry)
    return good, bad
def test_freedb_search():
    """Manual smoke test: run a soundex-filtered search for a known album
    and print the matching and non-matching entries.

    Goes through soundex_freedb_search(), so it needs the soundex module
    and performs a live search.
    """
    good, bad = soundex_freedb_search('thelonious monk', "straight no chaser")
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2 and is also valid Python 3 syntax.
    print("Matches:")
    for a in good:
        print(a)
    print("Non-matches:")
    for a in bad:
        print(a)
# Command-line use: search for the words given as arguments, list every
# matching entry, then fetch and print the xmcd data of each one.
if __name__ == '__main__':
    import sys
    search_words = ' '.join(sys.argv[1:])
    if not search_words:
        # Don't send an empty query to the site.
        sys.exit('usage: %s SEARCH WORDS...' % sys.argv[0])
    albums = freedb_search(search_words)
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2 and is also valid Python 3 syntax.
    for a in albums:
        print(a)
    for a in albums:
        print(a.load())