001package com.hfg.citation.ncbi;
002
003import java.io.BufferedInputStream;
004import java.io.IOException;
005import java.io.InputStream;
006import java.net.HttpURLConnection;
007import java.util.ArrayList;
008import java.util.List;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import com.hfg.bio.seq.format.SeqCitation;
013import com.hfg.citation.CitationRetriever;
014import com.hfg.citation.Journal;
015import com.hfg.util.StringBuilderPlus;
016import com.hfg.util.StringUtil;
017import com.hfg.util.User;
018import com.hfg.util.collection.CollectionUtil;
019import com.hfg.util.io.HTTPUtil;
020import com.hfg.xml.XMLTag;
021
022public class NCBI_eFetch implements CitationRetriever<SeqCitation>
023{
024   private User mUser;
025
026   // https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=22368089&tool=my_tool&email=my_email@example.com
027   private String mBaseURL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
028
029   // https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=my_tool&email=my_email@example.com&ids=10.1093/nar/gks1195
030   private String mIDConvertBaseURL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/";
031
032
033   // Ex: Science. 2002 Nov 8;298(5596):1248-51
034   private static final Pattern JOURNAL_CITATION_PATTERN = Pattern.compile("([^\\.]+)\\.\\s+(\\d{4})[^\\;]+\\;(\\d+)(?:\\((\\d+)\\))?\\:([\\d\\-]+)");
035
036   //---------------------------------------------------------------------------
037   public NCBI_eFetch(User inUser)
038   {
039      mUser = inUser;
040   }
041
042   //###########################################################################
043   // PUBLIC METHODS
044   //###########################################################################
045
046   //---------------------------------------------------------------------------
047   public String getBaseQueryURL()
048   {
049      return mBaseURL;
050   }
051
052   //---------------------------------------------------------------------------
053   public NCBI_eFetch setBaseQueryURL(String inValue)
054   {
055      mBaseURL = inValue;
056      return this;
057   }
058
059   //---------------------------------------------------------------------------
060   @Override
061   public MedlineCitation fetch(SeqCitation inQueryData)
062         throws IOException
063   {
064      if (! StringUtil.isSet(inQueryData.getPubMedId())
065          && StringUtil.isSet(inQueryData.getDOI()))
066      {
067         lookupPubMedIdFromDOI(inQueryData);
068      }
069
070      String url = composeQueryURL(inQueryData);
071      HttpURLConnection conn = HTTPUtil.openConnection(url);
072
073      MedlineCitation citation = null;
074
075      int responseCode = conn.getResponseCode();
076      if (200 == responseCode)
077      {
078         InputStream stream = new BufferedInputStream(conn.getInputStream());
079         XMLTag xmlTag = new XMLTag(stream);
080
081
082         XMLTag pubmedArticleTag = xmlTag.getRequiredSubtagByName(PubmedXML.PUBMED_ARTICLE);
083         XMLTag medlineCitationTag = pubmedArticleTag.getRequiredSubtagByName(PubmedXML.MEDLINE_CITATION);
084
085         citation = new MedlineCitation(medlineCitationTag);
086
087         // References
088         XMLTag pubmedDataTag = pubmedArticleTag.getOptionalSubtagByName(PubmedXML.PUBMED_DATA);
089         if (pubmedDataTag != null)
090         {
091            XMLTag refListTag = pubmedDataTag.getOptionalSubtagByName(PubmedXML.REFERENCE_LIST);
092            if (refListTag != null)
093            {
094               citation.setReferences(parseReferences(refListTag));
095            }
096         }
097      }
098
099      return citation;
100   }
101
102   //---------------------------------------------------------------------------
103   private String composeQueryURL(SeqCitation inQueryData)
104   {
105      StringBuilderPlus url = new StringBuilderPlus(getBaseQueryURL()).setDelimiter("&")
106            .append("?")
107            .append("tool=com_hfg")
108            .delimitedAppend("email=" + mUser.getEmail())
109            .delimitedAppend("db=pubmed")
110            .delimitedAppend("format=xml")
111            .delimitedAppend("id=" + inQueryData.getPubMedId());
112
113      return url.toString();
114   }
115
116   //---------------------------------------------------------------------------
117   private void lookupPubMedIdFromDOI(SeqCitation inQueryData)
118         throws IOException
119   {
120      StringBuilderPlus url = new StringBuilderPlus(mIDConvertBaseURL).setDelimiter("&")
121                  .append("?")
122                  .append("tool=com_hfg")
123                  .delimitedAppend("email=" + mUser.getEmail())
124                  .delimitedAppend("ids=" + inQueryData.getDOI());
125
126      HttpURLConnection conn = HTTPUtil.openConnection(url.toString());
127
128      InputStream stream = new BufferedInputStream(conn.getInputStream());
129
130      XMLTag xmlTag = new XMLTag(stream);
131
132      /*
133      Example response:
134
135      <pmcids status="ok">
136        <request idtype="doi" dois="" versions="yes" showaiid="no">
137          <echo>
138            tool=my_tool;email=my_email%40example.com;ids=10.1093%2Fnar%2Fgks1195
139          </echo>
140        </request>
141        <record requested-id="10.1093/NAR/GKS1195" pmcid="PMC3531190" pmid="23193287" doi="10.1093/nar/gks1195">
142          <versions>
143            <version pmcid="PMC3531190.1" current="true"/>
144          </versions>
145        </record>
146      </pmcids>
147
148       */
149      List<XMLTag> recordTags = xmlTag.getSubtagsByName("record");
150      if (CollectionUtil.hasValues(recordTags))
151      {
152         XMLTag recordTag = recordTags.get(0);
153         inQueryData.setPubMedId(recordTag.getAttributeValue("pmid"));
154      }
155   }
156
157   //---------------------------------------------------------------------------
158   private List<MedlineCitation> parseReferences(XMLTag inRefListTag)
159   {
160      List<MedlineCitation> references = null;
161
162      List<XMLTag> referenceTags = inRefListTag.getSubtagsByName(PubmedXML.REFERENCE);
163      if (CollectionUtil.hasValues(referenceTags))
164      {
165         references = new ArrayList<>(referenceTags.size());
166         for (XMLTag referenceTag : referenceTags)
167         {
168            MedlineCitation citation = new MedlineCitation();
169            references.add(citation);
170
171            XMLTag citationTag = referenceTag.getOptionalSubtagByName(PubmedXML.CITATION);
172            if (citationTag != null)
173            {
174               citation.setRawContent(citationTag.getContent().trim());
175
176               Matcher m = JOURNAL_CITATION_PATTERN.matcher(citation.toString());
177               if (m.matches())
178               {
179                  Journal journal = new Journal().setAbbrev(m.group(1));
180                  citation.setJournal(journal);
181
182                  citation.setYear(Integer.parseInt(m.group(2)));
183                  citation.setVolume(m.group(3));
184                  citation.setIssue(m.group(4));
185                  citation.setPages(m.group(5));
186               }
187            }
188
189            XMLTag articleIdListTag = referenceTag.getOptionalSubtagByName(PubmedXML.ARTICLE_ID_LIST);
190            if (articleIdListTag != null)
191            {
192               List<XMLTag> articleIdTags = referenceTag.getSubtagsByName(PubmedXML.ARTICLE_ID);
193               if (CollectionUtil.hasValues(articleIdTags))
194               {
195                  for (XMLTag articleIdTag : articleIdTags)
196                  {
197                     if (articleIdTag.getAttributeValue(PubmedXML.ID_TYPE_ATT).equalsIgnoreCase("pubmed"))
198                     {
199                        citation.setPubMedId(articleIdTag.getContent().trim());
200                     }
201                  }
202               }
203            }
204         }
205      }
206
207      return references;
208   }
209}