001package com.hfg.bio.seq.format.feature.genbank;
002
import com.hfg.bio.seq.format.GenBank;
import com.hfg.bio.seq.format.feature.FeatureKey;
import com.hfg.util.StringUtil;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
010
011//------------------------------------------------------------------------------
012/**
013 DDBJ/EMBL/GenBank feature table keys for flat-file records.
014 <p>
015 See <a href='http://www.insdc.org/documents/feature-table'>http://www.insdc.org/documents/feature-table</a>
016 </p>
017 @author J. Alex Taylor, hairyfatguy.com
018 */
019//------------------------------------------------------------------------------
020// com.hfg Library
021//
022// This library is free software; you can redistribute it and/or
023// modify it under the terms of the GNU Lesser General Public
024// License as published by the Free Software Foundation; either
025// version 2.1 of the License, or (at your option) any later version.
026//
027// This library is distributed in the hope that it will be useful,
028// but WITHOUT ANY WARRANTY; without even the implied warranty of
029// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
030// Lesser General Public License for more details.
031//
032// You should have received a copy of the GNU Lesser General Public
033// License along with this library; if not, write to the Free Software
034// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
035//
036// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
037// jataylor@hairyfatguy.com
038//------------------------------------------------------------------------------
039
040public class GenBankFeatureKey implements FeatureKey
041{
042   private String mName;
043   private String mDescription;
044
045   private static Map<String, GenBankFeatureKey> sUniqueMap = new HashMap<String, GenBankFeatureKey>();
046
047   public static final GenBankFeatureKey assembly_gap = new GenBankFeatureKey("assembly_gap")
048         .setDescription("gap between two components of a genome or transcriptome assembly");
049
050   public static final GenBankFeatureKey attenuator   = new GenBankFeatureKey("attenuator")
051         .setDescription("1) region of DNA at which regulation of termination of transcription occurs, which controls the expression of some bacterial operons;\n" +
052                         "2) sequence segment located between the promoter and the first structural gene that causes partial termination of transcription");
053
054   public static final GenBankFeatureKey C_region     = new GenBankFeatureKey("C_region")
055         .setDescription("constant region of immunoglobulin light and heavy chains, and T-cell receptor alpha, beta, and gamma chains; includes one or more exons depending on the particular chain");
056
057   public static final GenBankFeatureKey CAAT_signal  = new GenBankFeatureKey("CAAT_signal")
058         .setDescription("CAAT box; part of a conserved sequence located about 75 bp up-stream of the start point of eukaryotic transcription units which may be involved in RNA polymerase binding; consensus=GG(C or T)CAATCT [1,2].");
059
060   public static final GenBankFeatureKey CDS          = new GenBankFeatureKey("CDS")
061         .setDescription("coding sequence; sequence of nucleotides that corresponds with the sequence of amino acids in a protein (location includes stop codon); feature includes amino acid conceptual translation.");
062
063   public static final GenBankFeatureKey centromere   = new GenBankFeatureKey("centromere")
064         .setDescription("region of biological interest identified as a centromere and which has been experimentally characterized;");
065
066   public static final GenBankFeatureKey conflict   = new GenBankFeatureKey("conflict")
067         .setDescription("Not found in documentation");
068
069   public static final GenBankFeatureKey D_loop       = new GenBankFeatureKey("D-loop")
070         .setDescription("displacement loop; a region within mitochondrial DNA in which a short stretch of RNA is paired with one strand of DNA, displacing the original partner DNA strand in this region; also used to describe the displacement of a region of one strand of duplex DNA by a single stranded invader in the reaction catalyzed by RecA protein");
071
072   public static final GenBankFeatureKey D_segment    = new GenBankFeatureKey("D_segment")
073         .setDescription("Diversity segment of immunoglobulin heavy chain, and T-cell receptor beta chain;");
074
075   public static final GenBankFeatureKey enhancer     = new GenBankFeatureKey("enhancer")
076         .setDescription("a cis-acting sequence that increases the utilization of (some) eukaryotic promoters, and can function in either orientation and in any location (upstream or downstream) relative to the promoter;");
077
078   public static final GenBankFeatureKey exon         = new GenBankFeatureKey("exon")
079         .setDescription("region of genome that codes for portion of spliced mRNA, rRNA and tRNA; may contain 5'UTR, all CDSs and 3' UTR;");
080
081   public static final GenBankFeatureKey gap          = new GenBankFeatureKey("gap")
082         .setDescription("gap in the sequence");
083
084   public static final GenBankFeatureKey GC_signal    = new GenBankFeatureKey("GC_signal")
085         .setDescription("GC box; a conserved GC-rich region located upstream of the start point of eukaryotic transcription units which may occur in multiple copies or in either orientation; consensus=GGGCGG;");
086
087   public static final GenBankFeatureKey gene         = new GenBankFeatureKey("gene")
088         .setDescription("region of biological interest identified as a gene and for which a name has been assigned;");
089
090   public static final GenBankFeatureKey iDNA         = new GenBankFeatureKey("iDNA")
091         .setDescription("intervening DNA; DNA which is eliminated through any of several kinds of recombination;");
092
093   public static final GenBankFeatureKey intron       = new GenBankFeatureKey("intron")
094         .setDescription("a segment of DNA that is transcribed, but removed from within the transcript by splicing together the sequences (exons) on either side of it;");
095
096   public static final GenBankFeatureKey J_segment    = new GenBankFeatureKey("J_segment")
097         .setDescription("joining segment of immunoglobulin light and heavy chains, and T-cell receptor alpha, beta, and gamma chains; ");
098
099   public static final GenBankFeatureKey LTR          = new GenBankFeatureKey("LTR")
100         .setDescription("long terminal repeat, a sequence directly repeated at both ends of a defined sequence, of the sort typically found in retroviruses;");
101
102   public static final GenBankFeatureKey mat_peptide  = new GenBankFeatureKey("mat_peptide")
103         .setDescription("mature peptide or protein coding sequence; coding sequence for the mature or final peptide or protein product following post-translational modification; the location does not include the stop codon (unlike the corresponding CDS);");
104
105   public static final GenBankFeatureKey misc_binding = new GenBankFeatureKey("misc_binding")
106         .setDescription("site in nucleic acid which covalently or non-covalently binds another moiety that cannot be described by any other binding key (primer_bind or protein_bind);");
107
108   public static final GenBankFeatureKey misc_difference = new GenBankFeatureKey("misc_difference")
109         .setDescription("feature sequence is different from that presented in the entry and cannot be described by any other Difference key (unsure, old_sequence, variation, or modified_base);");
110
111   public static final GenBankFeatureKey misc_feature = new GenBankFeatureKey("misc_feature")
112         .setDescription("region of biological interest which cannot be described by any other feature key; a new or rare feature;");
113
114   public static final GenBankFeatureKey misc_recomb  = new GenBankFeatureKey("misc_recomb")
115         .setDescription("site of any generalized, site-specific or replicative recombination event where there is a breakage and reunion of duplex DNA that cannot be described by other recombination keys or qualifiers of source key (/proviral);");
116
117   public static final GenBankFeatureKey misc_RNA     = new GenBankFeatureKey("misc_RNA")
118         .setDescription("any transcript or RNA product that cannot be defined by other RNA keys (prim_transcript, precursor_RNA, mRNA, 5'UTR, 3'UTR, exon, CDS, sig_peptide, transit_peptide, mat_peptide, intron, polyA_site, ncRNA, rRNA and tRNA);");
119
120   public static final GenBankFeatureKey misc_signal  = new GenBankFeatureKey("misc_signal")
121         .setDescription("any region containing a signal controlling or altering gene function or expression that cannot be described by other signal keys (promoter, CAAT_signal, TATA_signal, -35_signal, -10_signal, GC_signal, RBS, polyA_signal, enhancer, attenuator, terminator, and rep_origin).");
122
123   public static final GenBankFeatureKey misc_structure = new GenBankFeatureKey("misc_structure")
124         .setDescription("any secondary or tertiary nucleotide structure or conformation that cannot be described by other Structure keys (stem_loop and D-loop);");
125
126   public static final GenBankFeatureKey mobile_element = new GenBankFeatureKey("mobile_element")
127         .setDescription("region of genome containing mobile elements;");
128
129   public static final GenBankFeatureKey modified_base  = new GenBankFeatureKey("modified_base")
130         .setDescription("the indicated nucleotide is a modified nucleotide and should be substituted for by the indicated molecule (given in the mod_base qualifier value)");
131
132   public static final GenBankFeatureKey mRNA           = new GenBankFeatureKey("mRNA")
133         .setDescription("messenger RNA; includes 5'untranslated region (5'UTR), coding sequences (CDS, exon) and 3'untranslated region (3'UTR);");
134
135   public static final GenBankFeatureKey ncRNA          = new GenBankFeatureKey("ncRNA")
136         .setDescription("a non-protein-coding gene, other than ribosomal RNA and transfer RNA, the functional molecule of which is the RNA transcript;");
137
138   public static final GenBankFeatureKey N_region       = new GenBankFeatureKey("N_region")
139         .setDescription("extra nucleotides inserted between rearranged immunoglobulin segments.");
140
141   public static final GenBankFeatureKey old_sequence   = new GenBankFeatureKey("old_sequence")
142         .setDescription("the presented sequence revises a previous version of the sequence at this location;");
143
144   public static final GenBankFeatureKey operon         = new GenBankFeatureKey("operon")
145         .setDescription("region containing polycistronic transcript including a cluster of genes that are under the control of the same regulatory sequences/promotor and in the same biological pathway");
146
147   public static final GenBankFeatureKey oriT            = new GenBankFeatureKey("oriT")
148         .setDescription("origin of transfer; region of a DNA molecule where transfer is initiated during the process of conjugation or mobilization");
149
150   public static final GenBankFeatureKey polyA_signal    = new GenBankFeatureKey("polyA_signal")
151         .setDescription("recognition region necessary for endonuclease cleavage of an RNA transcript that is followed by polyadenylation; consensus=AATAAA;");
152
153   public static final GenBankFeatureKey polyA_site      = new GenBankFeatureKey("polyA_site")
154         .setDescription("site on an RNA transcript to which will be added adenine residues by post-transcriptional polyadenylation;");
155
156   public static final GenBankFeatureKey precursor_RNA   = new GenBankFeatureKey("precursor_RNA")
157         .setDescription("any RNA species that is not yet the mature RNA product; may include 5' untranslated region (5'UTR), coding sequences (CDS, exon), intervening sequences (intron) and 3' untranslated region (3'UTR);");
158
159   public static final GenBankFeatureKey prim_transcript = new GenBankFeatureKey("prim_transcript")
160         .setDescription("primary (initial, unprocessed) transcript;  includes 5' untranslated region (5'UTR), coding sequences (CDS, exon), intervening sequences (intron) and 3' untranslated region (3'UTR);");
161
162   public static final GenBankFeatureKey primer_bind     = new GenBankFeatureKey("primer_bind")
163         .setDescription("non-covalent primer binding site for initiation of replication, transcription, or reverse transcription; includes site(s) for synthetic e.g., PCR primer elements;");
164
165   public static final GenBankFeatureKey promoter        = new GenBankFeatureKey("promoter")
166         .setDescription("region on a DNA molecule involved in RNA polymerase binding to initiate transcription;");
167
168   public static final GenBankFeatureKey protein_bind    = new GenBankFeatureKey("protein_bind")
169         .setDescription("non-covalent protein binding site on nucleic acid;");
170
171   public static final GenBankFeatureKey RBS             = new GenBankFeatureKey("RBS")
172         .setDescription("ribosome binding site;");
173
174   public static final GenBankFeatureKey regulatory      = new GenBankFeatureKey("regulatory")
175         .setDescription("region involved in regulation of expression; Not found in documentation");
176
177   public static final GenBankFeatureKey repeat_region   = new GenBankFeatureKey("repeat_region")
178         .setDescription("region of genome containing repeating units;");
179
180   public static final GenBankFeatureKey rep_origin      = new GenBankFeatureKey("rep_origin")
181         .setDescription("origin of replication; starting site for duplication of nucleic acid to give two identical copies;");
182
183   public static final GenBankFeatureKey rRNA            = new GenBankFeatureKey("rRNA")
184         .setDescription("mature ribosomal RNA; RNA component of the ribonucleoprotein particle (ribosome) which assembles amino acids into proteins.");
185
186   public static final GenBankFeatureKey S_region        = new GenBankFeatureKey("S_region")
187         .setDescription("switch region of immunoglobulin heavy chains; involved in the rearrangement of heavy chain DNA leading to the expression of a different immunoglobulin class from the same B-cell;");
188
189   public static final GenBankFeatureKey sig_peptide     = new GenBankFeatureKey("sig_peptide")
190         .setDescription("signal peptide coding sequence; coding sequence for an N-terminal domain of a secreted protein; this domain is involved in attaching nascent polypeptide to the membrane leader sequence;");
191
192   public static final GenBankFeatureKey source          = new GenBankFeatureKey("source")
193         .setDescription("identifies the biological source of the specified span of the sequence; this key is mandatory; more than one source key per sequence is allowed; every entry/record will have, as a minimum, either a single source key spanning the entire sequence or multiple source keys, which together, span the entire sequence.");
194
195   public static final GenBankFeatureKey stem_loop       = new GenBankFeatureKey("stem_loop")
196         .setDescription("hairpin; a double-helical region formed by base-pairing between adjacent (inverted) complementary sequences in a single strand of RNA or DNA. ");
197
198   public static final GenBankFeatureKey STS             = new GenBankFeatureKey("STS")
199         .setDescription("sequence tagged site; short, single-copy DNA sequence that characterizes a mapping landmark on the genome andcan be detected by PCR; a region of the genome can bemapped by determining the order of a series of STSs;");
200
201   public static final GenBankFeatureKey TATA_signal     = new GenBankFeatureKey("TATA_signal")
202         .setDescription("TATA box; Goldberg-Hogness box; a conserved AT-rich septamer found about 25 bp before the start point of each eukaryotic RNA polymerase II transcript unit which may be involved in positioning the enzyme  for correct initiation; consensus=TATA(A or T)A(A or T)");
203
204   public static final GenBankFeatureKey telomere        = new GenBankFeatureKey("telomere")
205         .setDescription("region of biological interest identified as a telomere and which has been experimentally characterized;");
206
207   public static final GenBankFeatureKey terminator      = new GenBankFeatureKey("terminator")
208         .setDescription("sequence of DNA located either at the end of the ranscript that causes RNA polymerase to terminate transcription;");
209
210   public static final GenBankFeatureKey tmRNA           = new GenBankFeatureKey("tmRNA")
211         .setDescription("transfer messenger RNA; tmRNA acts as a tRNA first, and then as an mRNA that encodes a peptide tag; the ribosome translates this mRNA region of tmRNA and attaches the encoded peptide tag to the C-terminus of the unfinished protein; this attached tag targets the protein for destruction or proteolysis;");
212
213   public static final GenBankFeatureKey transit_peptide = new GenBankFeatureKey("transit_peptide")
214         .setDescription("transit peptide coding sequence; coding sequence for an N-terminal domain of a nuclear-encoded organellar protein; this domain is involved in post-translational import of the protein into the organelle;");
215
216   public static final GenBankFeatureKey tRNA            = new GenBankFeatureKey("tRNA")
217         .setDescription("mature transfer RNA, a small RNA molecule (75-85 bases long) that mediates the translation of a nucleic acid sequence into an amino acid sequence;");
218
219   public static final GenBankFeatureKey unsure          = new GenBankFeatureKey("unsure")
220         .setDescription("author is unsure of exact sequence in this region;");
221
222   public static final GenBankFeatureKey V_region        = new GenBankFeatureKey("V_region")
223         .setDescription("variable region of immunoglobulin light and heavy chains, and T-cell receptor alpha, beta, and gamma chains;  codes for the variable amino terminal portion; can be composed of V_segments, D_segments, N_regions, and J_segments;");
224
225   public static final GenBankFeatureKey V_segment       = new GenBankFeatureKey("V_segment")
226         .setDescription("variable segment of immunoglobulin light and heavy chains, and T-cell receptor alpha, beta, and gamma chains; codes for most of the variable region (V_region) and the last few amino acids of the leader peptide;");
227
228   public static final GenBankFeatureKey variation       = new GenBankFeatureKey("variation")
229         .setDescription("a related strain contains stable mutations from the same gene (e.g., RFLPs, polymorphisms, etc.) which differ from the presented sequence at this location (and possibly others);");
230
231   public static final GenBankFeatureKey ThreePrime_UTR = new GenBankFeatureKey("3'UTR")
232         .setDescription("1) region at the 3' end of a mature transcript (following the stop codon) that is not translated into a protein;\n" +
233                         "2) region at the 3' end of an RNA virus (following the last stop codon) that is not translated into a protein;");
234
235   public static final GenBankFeatureKey FivePrime_UTR = new GenBankFeatureKey("5'UTR")
236         .setDescription("1) region at the 5' end of a mature transcript (preceding the initiation codon) that is not translated into a protein;\n" +
237                         "2) region at the 5' end of an RNA virus genome (preceding the first initiation codon) that is not translated into a protein;");
238
239   public static final GenBankFeatureKey Minus10_signal = new GenBankFeatureKey("-10_signal")
240         .setDescription("Pribnow box; a conserved region about 10 bp upstream of the start point of bacterial transcription units which may be involved in binding RNA polymerase; consensus=TAtAaT");
241
242   public static final GenBankFeatureKey Minus35_signal = new GenBankFeatureKey("-35_signal")
243         .setDescription("a conserved hexamer about 35 bp upstream of the start point of bacterial transcription units; consensus=TTGACa or TGTTGACA;");
244
245   //###########################################################################
246   // CONSTRUCTORS
247   //###########################################################################
248
249   //---------------------------------------------------------------------------
250   private GenBankFeatureKey(String inName)
251   {
252      mName = inName;
253      sUniqueMap.put(mName, this);
254   }
255
256   //###########################################################################
257   // PUBLIC METHODS
258   //###########################################################################
259
260   //---------------------------------------------------------------------------
261   public static GenBankFeatureKey valueOf(String inValue)
262   {
263      GenBankFeatureKey key = sUniqueMap.get(inValue);
264      if (null == key)
265      {
266         key = new GenBankFeatureKey(inValue);
267         GenBank.getLogger().warning(StringUtil.singleQuote(inValue) + " is not a recognized GenBank feature key!");
268      }
269
270      return key;
271   }
272
273   //---------------------------------------------------------------------------
274   public static Collection<GenBankFeatureKey> values()
275   {
276      return sUniqueMap.values();
277   }
278
279   //---------------------------------------------------------------------------
280   public String name()
281   {
282      return mName;
283   }
284
285   //---------------------------------------------------------------------------
286   @Override
287   public String toString()
288   {
289      return name();
290   }
291
292   //---------------------------------------------------------------------------
293   private GenBankFeatureKey setDescription(String inValue)
294   {
295      mDescription = inValue;
296      return this;
297   }
298
299   //---------------------------------------------------------------------------
300   public String getDescription()
301   {
302      return mDescription;
303   }
304
305}