diff --git a/src/DocumentLibrary/Document.py b/src/DocumentLibrary/Document.py index b039905..446c26e 100644 --- a/src/DocumentLibrary/Document.py +++ b/src/DocumentLibrary/Document.py @@ -323,7 +323,7 @@ for p in self.propertyMap(): if p.get('type', '') in ['string', 'text'] and not p.get('hidden', 0): v = self.getProperty(p['id']) - if v: r.append(unicode(v, 'latin-1', 'ignore')) + if v: r.append(v) # Add the titles of the topics assigned to this document for t in self.topicMap(): diff --git a/src/DocumentLibrary/DocumentStore.py b/src/DocumentLibrary/DocumentStore.py index 48a5b65..09858bc 100644 --- a/src/DocumentLibrary/DocumentStore.py +++ b/src/DocumentLibrary/DocumentStore.py @@ -390,7 +390,7 @@ """Deletes a document from the document store""" document=self.query(id=REQUEST.id)[0] - topics=document.topics + topics=getattr(document, 'topics', []) self.manage_delObjects(REQUEST.id) self.afterDelete(topics) diff --git a/src/DocumentLibrary/etools.py b/src/DocumentLibrary/etools.py index 7d41904..8bb1afc 100755 --- a/src/DocumentLibrary/etools.py +++ b/src/DocumentLibrary/etools.py @@ -6,7 +6,7 @@ import os.path import re import sha -from OFS.Image import File +#from OFS.Image import File import sys search_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" @@ -14,25 +14,37 @@ tool = "zope-document-library" email = "alex@floop.org.uk" -def getMatches(title, authors, date): +def getMatches(title, authors, date, maxtitlewords=0, filterauthor=0): """ title is the title of the publication, authors is a list of surnames (optionally append initials) date is the publish date in the form YYYY/MM/DD (MM+DD optional) + if maxtitlewords is not 0, only this number of the longest words + in the title will be used in the search """ - results = [] terms = [] if (title != None) and (title != ''): - terms.append(title) # should use [ti], but doesn't work as expected + if maxtitlewords == 0: + terms.append(title) # should use [ti], but doesn't work as expected 
+ else: + longwords=title.split() + longwords.sort(lambda x, y: len(y) - len(x)) + for word in longwords[:maxtitlewords]: + terms.append(word + '[ti]') if (authors != None): for author in authors: if (author != ''): terms.append(author + '[au]') + if len(authors) > 0: + first_author = authors[0] + else: + first_author = None if (date != None): terms.append(date + '[dp]') search_term = string.join(terms, ' AND ') + print search_term params = urllib.urlencode({ 'db': 'pubmed', 'term': search_term, @@ -45,13 +57,18 @@ ids = [] for id_node in dom.getElementsByTagName('Id'): ids.append(id_node.firstChild.data) -# webenv = dom.getElementsByTagName('WebEnv')[0].firstChild.data + return getIds(ids, filterauthor, first_author) + +def getIds(ids, filterauthor=0, first_author=None): + results = [] params = urllib.urlencode({ 'db': 'pubmed', 'id': string.join(ids,','), 'report': 'xml', 'mode': 'text', - 'rettype': 'abstract' + 'rettype': 'abstract', + 'retmax': '100', + 'dispmax': '100' }) result = urllib.urlopen(fetch_url, params) # result = open('test2.xml') @@ -67,9 +84,19 @@ author_nodes = article.getElementsByTagName('Author') authors = [] for author_node in author_nodes: - author = author_node.getElementsByTagName('LastName')[0].firstChild.data - author = author + " " + author_node.getElementsByTagName('Initials')[0].firstChild.data - authors.append(author) + try: + author = author_node.getElementsByTagName('LastName')[0].firstChild.data + author = author + " " + author_node.getElementsByTagName('Initials')[0].firstChild.data + authors.append(author) + except: + pass + if (filterauthor != 0) and (first_author != None): # first author in search must be first author in returned list + if len(authors) == 0: + continue + lc_author = authors[0].lower() + if lc_author.find(first_author.lower()) == -1: + continue + article_info['authors'] = authors pubdate_nodes = article.getElementsByTagName('PubDate') date = None @@ -92,15 +119,38 @@ article_info['abstract'] = 
article.getElementsByTagName('AbstractText')[0].firstChild.data except: article_info['abstract'] = '' + try: + article_info['journal'] = pm_article.getElementsByTagName('MedlineTA')[0].firstChild.data + except: + article_info['journal'] = '' + try: + article_info['pages'] = article.getElementsByTagName('MedlinePgn')[0].firstChild.data + except: + article_info['pages'] = '' + try: + article_info['volume'] = article.getElementsByTagName('Volume')[0].firstChild.data + except: + article_info['volume'] = '' + try: + article_info['issue'] = article.getElementsByTagName('Issue')[0].firstChild.data + except: + article_info['issue'] = '' results.append(article_info) return results #print getMatches('Molecular evolution of CXC chemokines and receptors', -# ['Shields'], None) +# ['Shields'], None, 2) -print getMatches('Regulation of CD27 Expression on Subsets of Mature T-Lymphocytes', - ['Hintzen'], None) +#print getMatches('Regulation of CD27 Expression on Subsets of Mature T-Lymphocytes', +# ['Hintzen'], None, 2) +#print getMatches('IL12 therapy and cytokine production by PBMC in chronic Hepatitis C', +# ['Berg'], '2000') +#print getMatches('cyclic regulation of CD45 isoform expression in a long term human CD4posiCD45RAposi T cell line', +# ['Rothstein'], '1991', 1) +matches = getMatches(None, ['Chang'], '2002', 0) +print matches +print len(matches) file_re = re.compile('([^0-9]*)\s*([0-9]*)\s*(.*)(\.pdf|\.doc)') @@ -126,8 +176,8 @@ doc_author = mo.group(1) doc_year = mo.group(2) doc_title = mo.group(3) - if doc_title == '': - doc_title = os.path.split(filename)[1] +# if doc_title == '': +# doc_title = os.path.split(filename)[1] doc_ext = mo.group(4) file = open(filename) hash = sha.new(file.read())