package com.hfg.citation.ncbi;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.bio.seq.format.SeqCitation;
import com.hfg.citation.CitationRetriever;
import com.hfg.citation.Journal;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;
import com.hfg.util.User;
import com.hfg.util.collection.CollectionUtil;
import com.hfg.util.io.HTTPUtil;
import com.hfg.xml.XMLTag;

/**
 * Citation retriever backed by the NCBI E-utilities <code>efetch</code> service (PubMed database).
 * <p>
 * When the query carries a DOI but no PubMed id, the PMC id-conversion service is
 * consulted first to resolve the DOI to a PubMed id.
 * </p>
 */
public class NCBI_eFetch implements CitationRetriever<SeqCitation>
{
   // Supplies the contact e-mail address sent with every request.
   private final User mUser;

   // https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=22368089&tool=my_tool&email=my_email@example.com
   private String mBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";

   // https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids=10.1093/nar/gks1195
   private final String mIDConvertBaseURL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/";


   // Ex: Science. 2002 Nov 8;298(5596):1248-51
   // Groups: 1 = journal abbreviation, 2 = year, 3 = volume, 4 = issue (optional), 5 = pages
   private static final Pattern JOURNAL_CITATION_PATTERN = Pattern.compile("([^\\.]+)\\.\\s+(\\d{4})[^\\;]+\\;(\\d+)(?:\\((\\d+)\\))?\\:([\\d\\-]+)");

   //---------------------------------------------------------------------------
   /**
    * @param inUser the user whose e-mail address is sent as the contact for each request
    */
   public NCBI_eFetch(User inUser)
   {
      mUser = inUser;
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    * @return the base URL of the efetch service that queries are sent to
    */
   public String getBaseQueryURL()
   {
      return mBaseURL;
   }

   //---------------------------------------------------------------------------
   /**
    * Overrides the base URL of the efetch service.
    *
    * @param inValue the new base URL (without any query string)
    * @return this object, to allow method chaining
    */
   public NCBI_eFetch setBaseQueryURL(String inValue)
   {
      mBaseURL = inValue;
      return this;
   }

   //---------------------------------------------------------------------------
   /**
    * Retrieves the PubMed record for the specified citation.
    * <p>
    * If the query has a DOI but no PubMed id, the PubMed id is looked up first and
    * stored on the query object as a side effect.
    * </p>
    *
    * @param inQueryData the citation to look up; its PubMed id or its DOI should be set
    * @return the retrieved citation, or null if no PubMed id could be determined
    *         or the service did not answer with HTTP 200
    * @throws IOException if communication with a service fails
    */
   @Override
   public MedlineCitation fetch(SeqCitation inQueryData)
      throws IOException
   {
      if (! StringUtil.isSet(inQueryData.getPubMedId())
          && StringUtil.isSet(inQueryData.getDOI()))
      {
         lookupPubMedIdFromDOI(inQueryData);
      }

      // Without a PubMed id the request would be sent with "id=null"; treat it as not found.
      if (! StringUtil.isSet(inQueryData.getPubMedId()))
      {
         return null;
      }

      HttpURLConnection conn = HTTPUtil.openConnection(composeQueryURL(inQueryData));

      if (HttpURLConnection.HTTP_OK != conn.getResponseCode())
      {
         // The error body is not read, so release the connection explicitly.
         conn.disconnect();
         return null;
      }

      // Close the response stream even if parsing fails.
      try (InputStream stream = new BufferedInputStream(conn.getInputStream()))
      {
         return parseArticle(new XMLTag(stream));
      }
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Builds a citation (including any reference list) from the root tag of an efetch response.
   private MedlineCitation parseArticle(XMLTag inRootTag)
   {
      XMLTag pubmedArticleTag = inRootTag.getRequiredSubtagByName(PubmedXML.PUBMED_ARTICLE);
      XMLTag medlineCitationTag = pubmedArticleTag.getRequiredSubtagByName(PubmedXML.MEDLINE_CITATION);

      MedlineCitation citation = new MedlineCitation(medlineCitationTag);

      // References
      XMLTag pubmedDataTag = pubmedArticleTag.getOptionalSubtagByName(PubmedXML.PUBMED_DATA);
      if (pubmedDataTag != null)
      {
         XMLTag refListTag = pubmedDataTag.getOptionalSubtagByName(PubmedXML.REFERENCE_LIST);
         if (refListTag != null)
         {
            citation.setReferences(parseReferences(refListTag));
         }
      }

      return citation;
   }

   //---------------------------------------------------------------------------
   // Starts a query URL with the parameters common to every request (tool name and contact e-mail).
   private StringBuilderPlus startQueryURL(String inBaseURL)
   {
      return new StringBuilderPlus(inBaseURL).setDelimiter("&")
            .append("?")
            .append("tool=com_hfg")
            .delimitedAppend("email=" + encodeQueryValue(mUser.getEmail()));
   }

   //---------------------------------------------------------------------------
   // URL-encodes a query parameter value so that characters such as '&', '#', '+' or '/'
   // cannot alter the structure of the query string. A null value becomes an empty string.
   private static String encodeQueryValue(Object inValue)
   {
      if (null == inValue)
      {
         return "";
      }

      try
      {
         return URLEncoder.encode(inValue.toString(), "UTF-8");
      }
      catch (UnsupportedEncodingException e)
      {
         // Every Java platform is required to support UTF-8.
         throw new IllegalStateException("UTF-8 encoding is not available", e);
      }
   }

   //---------------------------------------------------------------------------
   // Composes the efetch URL that requests the XML record for the query's PubMed id.
   private String composeQueryURL(SeqCitation inQueryData)
   {
      StringBuilderPlus url = startQueryURL(getBaseQueryURL())
            .delimitedAppend("db=pubmed")
            .delimitedAppend("format=xml")
            .delimitedAppend("id=" + encodeQueryValue(inQueryData.getPubMedId()));

      return url.toString();
   }

   //---------------------------------------------------------------------------
   // Asks the id-conversion service for the PubMed id matching the query's DOI and
   // stores the "pmid" attribute of the first returned record on the query object.
   private void lookupPubMedIdFromDOI(SeqCitation inQueryData)
      throws IOException
   {
      StringBuilderPlus url = startQueryURL(mIDConvertBaseURL)
            .delimitedAppend("ids=" + encodeQueryValue(inQueryData.getDOI()));

      HttpURLConnection conn = HTTPUtil.openConnection(url.toString());

      XMLTag xmlTag;
      // Close the response stream once the XML has been parsed (or parsing has failed).
      try (InputStream stream = new BufferedInputStream(conn.getInputStream()))
      {
         xmlTag = new XMLTag(stream);
      }

      /*
       Example response:

       <pmcids status="ok">
         <request idtype="doi" dois="" versions="yes" showaiid="no">
           <echo>
             tool=my_tool;email=my_email%40example.com;ids=10.1093%2Fnar%2Fgks1195
           </echo>
         </request>
         <record requested-id="10.1093/NAR/GKS1195" pmcid="PMC3531190" pmid="23193287" doi="10.1093/nar/gks1195">
           <versions>
             <version pmcid="PMC3531190.1" current="true"/>
           </versions>
         </record>
       </pmcids>

       */
      List<XMLTag> recordTags = xmlTag.getSubtagsByName("record");
      if (CollectionUtil.hasValues(recordTags))
      {
         XMLTag recordTag = recordTags.get(0);
         inQueryData.setPubMedId(recordTag.getAttributeValue("pmid"));
      }
   }

   //---------------------------------------------------------------------------
   // Converts the reference tags of a reference list into citations.
   // Returns null if the list contains no reference tags.
   private List<MedlineCitation> parseReferences(XMLTag inRefListTag)
   {
      List<MedlineCitation> references = null;

      List<XMLTag> referenceTags = inRefListTag.getSubtagsByName(PubmedXML.REFERENCE);
      if (CollectionUtil.hasValues(referenceTags))
      {
         references = new ArrayList<>(referenceTags.size());
         for (XMLTag referenceTag : referenceTags)
         {
            MedlineCitation citation = new MedlineCitation();
            references.add(citation);

            XMLTag citationTag = referenceTag.getOptionalSubtagByName(PubmedXML.CITATION);
            if (citationTag != null)
            {
               String rawCitation = citationTag.getContent().trim();
               citation.setRawContent(rawCitation);

               // Match against the raw citation text itself; fields are filled in
               // only when the whole text has the expected journal citation form.
               Matcher m = JOURNAL_CITATION_PATTERN.matcher(rawCitation);
               if (m.matches())
               {
                  Journal journal = new Journal().setAbbrev(m.group(1));
                  citation.setJournal(journal);

                  citation.setYear(Integer.parseInt(m.group(2)));
                  citation.setVolume(m.group(3));
                  citation.setIssue(m.group(4)); // null when the citation has no issue
                  citation.setPages(m.group(5));
               }
            }

            XMLTag articleIdListTag = referenceTag.getOptionalSubtagByName(PubmedXML.ARTICLE_ID_LIST);
            if (articleIdListTag != null)
            {
               // The article id tags are looked up on the id list tag, not on the reference tag.
               List<XMLTag> articleIdTags = articleIdListTag.getSubtagsByName(PubmedXML.ARTICLE_ID);
               if (CollectionUtil.hasValues(articleIdTags))
               {
                  for (XMLTag articleIdTag : articleIdTags)
                  {
                     // Constant-first comparison tolerates a missing id type attribute.
                     if ("pubmed".equalsIgnoreCase(articleIdTag.getAttributeValue(PubmedXML.ID_TYPE_ATT)))
                     {
                        citation.setPubMedId(articleIdTag.getContent().trim());
                     }
                  }
               }
            }
         }
      }

      return references;
   }
}