diff --git a/src/DocumentLibrary/etools.py b/src/DocumentLibrary/etools.py new file mode 100755 index 0000000..7d41904 --- /dev/null +++ b/src/DocumentLibrary/etools.py @@ -0,0 +1,166 @@ +#!/usr/bin/python2.1 + +import string +import urllib +from xml.dom.minidom import parse +import os.path +import re +import sha +from OFS.Image import File +import sys + +search_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +fetch_url = "http://www.ncbi.nlm.nih.gov/entrez/utils/pmfetch.fcgi" +tool = "zope-document-library" +email = "alex@floop.org.uk" + +def getMatches(title, authors, date): + """ + title is the title of the publication, + authors is a list of surnames (optionally append initials) + date is the publish date in the form YYYY/MM/DD (MM+DD optional) + """ + + results = [] + terms = [] + if (title != None) and (title != ''): + terms.append(title) # should use [ti], but doesn't work as expected + if (authors != None): + for author in authors: + if (author != ''): + terms.append(author + '[au]') + if (date != None): + terms.append(date + '[dp]') + + search_term = string.join(terms, ' AND ') + params = urllib.urlencode({ + 'db': 'pubmed', + 'term': search_term, + 'tool': tool, + 'email': email}) +# 'usehistory': 'y'}) + result = urllib.urlopen(search_url, params) +# result = open("test.xml") + dom = parse(result) + ids = [] + for id_node in dom.getElementsByTagName('Id'): + ids.append(id_node.firstChild.data) +# webenv = dom.getElementsByTagName('WebEnv')[0].firstChild.data + params = urllib.urlencode({ + 'db': 'pubmed', + 'id': string.join(ids,','), + 'report': 'xml', + 'mode': 'text', + 'rettype': 'abstract' + }) + result = urllib.urlopen(fetch_url, params) +# result = open('test2.xml') + dom = parse(result) + temp_out = open('/tmp/result.xml', 'wb') + temp_out.write(dom.toxml().encode('utf8')) + for pm_article in dom.getElementsByTagName('PubmedArticle'): + article_info = {} + pmid = pm_article.getElementsByTagName('PMID')[0].firstChild.data + article_info['pubmedId'] = pmid + article = pm_article.getElementsByTagName('Article')[0] + article_info['title'] = article.getElementsByTagName('ArticleTitle')[0].firstChild.data + author_nodes = article.getElementsByTagName('Author') + authors = [] + for author_node in author_nodes: + author = author_node.getElementsByTagName('LastName')[0].firstChild.data + author = author + " " + author_node.getElementsByTagName('Initials')[0].firstChild.data + authors.append(author) + article_info['authors'] = authors + pubdate_nodes = article.getElementsByTagName('PubDate') + date = None + if (len(pubdate_nodes)> 0): + year_nodes = pubdate_nodes[0].getElementsByTagName('Year') + if (len(year_nodes)> 0): + year = year_nodes[0].firstChild.data + month_nodes = pubdate_nodes[0].getElementsByTagName('Month') + if (len(month_nodes)> 0): + month = month_nodes[0].firstChild.data + monthMap = {'Jan': '01', 'Feb': '02', 'Mar': '03', + 'Apr': '04', 'May': '05', 'Jun': '06', + 'Jul': '07', 'Aug': '08', 'Sep': '09', + 'Oct': '10', 'Nov': '11', 'Dec': '12'} + date = year + '/' + monthMap[month] + '/01' + else: + date = year + '/01/01' + article_info['date'] = date + try: + article_info['abstract'] = article.getElementsByTagName('AbstractText')[0].firstChild.data + except: + article_info['abstract'] = '' + results.append(article_info) + + return results + +#print getMatches('Molecular evolution of CXC chemokines and receptors', +# ['Shields'], None) + +print getMatches('Regulation of CD27 Expression on Subsets of Mature T-Lymphocytes', + ['Hintzen'], None) + +file_re = re.compile('([^0-9]*)\s*([0-9]*)\s*(.*)(\.pdf|\.doc)') + +class NamedFile: + filename = '' + seek = None + read = None + tell = None + def __init__(self, filename, file): + self.filename = filename + self.file = file + self.seek = file.seek + self.read = file.read + self.tell = file.tell + +def addLocalDocument(store, filename, REQUEST): + mo = file_re.match(os.path.split(filename)[1]) + doc_author = '' + doc_year = '' + doc_title = '' + doc_ext = '' + if mo: + doc_author = mo.group(1) + doc_year = mo.group(2) + doc_title = mo.group(3) + if doc_title == '': + doc_title = os.path.split(filename)[1] + doc_ext = mo.group(4) + file = open(filename) + hash = sha.new(file.read()) + file.close() + results = store.query(hash=hash.hexdigest()) + if len(results) == 0: + upload_file = NamedFile(filename, open(filename)) + REQUEST.set('file', upload_file) + REQUEST.set('title', doc_title) + if (doc_author != ''): + REQUEST.set('creator', doc_author) + if (doc_year != ''): + REQUEST.set('date', doc_year + '/01/01') + REQUEST.set('topics', [doc_year]) + REQUEST.set('hash', hash.hexdigest()) + REQUEST.set('type', 'Article') + store.addDocumentFile(REQUEST) +# upload_file.file.close() + return "Document %s added" % filename + else: + return "Document %s hash matches existing document" % filename + +#print addDocument(None, '/home/Library/1991/Rothstein1991 cyclic regulation of CD45 isoform expression in a long term human CD4posiCD45RAposi T cell line.pdf') + +def addDocumentsUnder(store, directory, REQUEST): + results = [] + for entry in os.listdir(directory): + filename = os.path.join(directory, entry) + if os.path.isfile(filename): +# try: + results.append(addLocalDocument(store, filename, REQUEST)) +# except: +# results.append(string.join(sys.exc_info(), '\n')) + elif os.path.isdir(filename): + addDocumentsUnder(store, filename, REQUEST) + return ("addDocumentsUnder()

Adding documents under %s

"