def cleanString(toclean):
    """Return *toclean* reduced to ASCII letters and digits only.

    The text is first transliterated to ASCII with ``unidecode.unidecode``;
    every run of characters outside ``A-Za-z0-9`` (whitespace, punctuation,
    symbols, ...) is then removed.  Letter case is left as it was in the
    input.

    Args:
        toclean: The text to clean.  Either a UTF-8 encoded byte string or
            an already-decoded text (unicode) string.

    Returns:
        The cleaned string; empty when nothing alphanumeric remains.

    Raises:
        UnicodeDecodeError: If a byte string is passed that is not valid
            UTF-8.

    NOTE(review): the previous implementation additionally upper-cased the
    first letter of every space-separated word (via ``upcaseFirstLetter``)
    before joining them.  This version does not -- confirm that callers no
    longer rely on the CamelCased result.
    """
    # Imported at function scope so the block is self-contained.
    import re
    import unidecode

    # Only byte strings need decoding.  Calling .decode() on text that is
    # already unicode makes Python 2 implicitly encode it as ASCII first
    # (UnicodeEncodeError on any non-ASCII character) and does not exist at
    # all on Python 3 ``str``.
    if isinstance(toclean, bytes):
        toclean = toclean.decode('utf-8')

    ascii_text = unidecode.unidecode(toclean)
    return re.sub(r'[^A-Za-z0-9]+', '', ascii_text)