Examples of reading / writing sequence formats via com-hfg objects.

The com.hfg.bio.seq.format package contains classes for reading or writing protein or nucleotide sequence objects in various formats. Readable formats implement ReadableSeqFormat and writable formats implement WritableSeqFormat. [Note that this package is still a work in progress, so not all formats are both readable and writable. Available formats are FASTA, FASTQ, GenBank, and EMBL/UniProt. EMBL/UniProt currently only supports reading.]

  1. Reading protein sequences from a FASTA-format file
  2. Writing protein sequences to a FASTA-format file
  3. Reading a nucleotide sequence from a GenBank-format file

Example 1: Reading protein sequences from a FASTA-format file

  1. File seqFile = new File("test_seqs.fasta");
  2.  
  3. BufferedReader reader = new BufferedReader(new FileReader(seqFile));
  4.  
  5. FASTA<Protein> fastaObj = new FASTA<>(new ProteinFactory());
  6.  
  7. BufferedSeqReader<Protein> seqReader = new BufferedSeqReader<Protein>(reader, fastaObj);
  8.  
  9. List<Protein> seqs = seqReader.readAll();
  10.  
  11. seqReader.close();

Example 2: Writing protein sequences to a FASTA-format file

  1. FASTA<Protein> fastaObj = new FASTA<>();
  2.  
  3. FileWriter writer = new FileWriter("write_seqs_test.fasta");
  4. for (Protein protein : seqs)
  5. {
  6. fastaObj.write(protein, writer);
  7. }
  8.  
  9. writer.close();

Example 3: Reading a nucleotide sequence from a GenBank-format file

  1. File seqFile = new File("test_seq.gb");
  2.  
  3. BufferedReader reader = new BufferedReader(new FileReader(seqFile));
  4.  
  5. GenBank<NucleicAcid> formatObj = new GenBank<>(new NucleicAcidFactory());
  6.  
  7. BufferedSeqReader<NucleicAcid> seqReader = new BufferedSeqReader<>(reader, formatObj);
  8.  
  9. List<NucleicAcid> seqs = new ArrayList<>();
  10. int seqCount = 0;
  11. while (seqReader.hasNext())
  12. {
  13. seqs.add(seqReader.next());
  14. seqCount++;
  15. }
  16.  
  17. seqReader.close();
  18.  
  19. NucleicAcid seq = seqs.get(0);
  20.  
  21. Assert.assertEquals("R88064.1", seq.getID());
  22. Assert.assertEquals(MolType.mRNA, seq.getMolType());
  23. Assert.assertEquals(SeqTopology.LINEAR, seq.getSeqTopology());
  24. Assert.assertEquals(NCBIGenBankDivision.EST, seq.getSeqRepositoryDivision());
  25. Assert.assertEquals("ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone IMAGE:165908 5', mRNA sequence", seq.getDescription());
  26. Assert.assertEquals(460, seq.length());
  27. Assert.assertEquals("Homo sapiens", seq.getNCBITaxon().getScientificName());
  28.  
  29. Assert.assertEquals(1, seq.getReferences().size());
  30.  
  31. SeqCitation reference = seq.getReferences().get(0);
  32. Assert.assertEquals("The WashU-Merck EST Project", reference.getTitle());
  33. Assert.assertEquals("Unpublished", reference.toString());
  34. Assert.assertEquals(21, reference.getAuthors().size());
  35. Assert.assertEquals("Hillier,L., Clark,N., Dubuque,T., Elliston,K., Hawkins,M., Holman,M., Hultman,M., Kucaba,T., Le,M., Lennon,G., Marra,M., Parsons,J., Rifkin,L., Rohlfing,T., Soares,M., Tan,F., Trevaskis,E., Waterston,R., Williamson,A., Wohldmann,P., Wilson,R.", StringUtil.join(reference.getAuthors(), ", "));
  36.  
  37. List<SeqFeature> cdsFeatures = seq.getFeatures(GenBankFeatureKey.CDS);

Return to Main Page