diff --git a/src/DocumentLibrary/FileConverters/pdf.py b/src/DocumentLibrary/FileConverters/pdf.py index 12707e0..7247ce7 100644 --- a/src/DocumentLibrary/FileConverters/pdf.py +++ b/src/DocumentLibrary/FileConverters/pdf.py @@ -1,6 +1,7 @@ """PDF to text file converter for Document Library""" from os import popen, remove +from codecs import lookup content_type = 'application/pdf' @@ -13,7 +14,13 @@ """Convert pdf data to raw text""" tmp_name = documentFile._writeToTempFile() - text = popen('pdftotext %s -' % tmp_name).read() + text_fd = popen('pdftotext %s -' % tmp_name) + # cope with default latin-1 encoding if possible + try: + reader = lookup('latin-1')[2](text_fd) + text = reader.read() + except: + text = text_fd.read() remove(tmp_name) return text