Examples of reading / writing sequence formats via com-hfg objects.

The com.hfg.bio.seq.format package contains classes for reading or writing protein or nucleotide sequence objects in various formats. Readable formats implement ReadableSeqFormat and writable formats implement WritableSeqFormat. [Note that this package is still a work in progress, so not all formats are both readable and writable. Available formats are FASTA, FASTQ, GenBank, and EMBL/UniProt. EMBL/UniProt currently supports reading only.]

  1. Reading protein sequences from a FASTA-format file
  2. Writing protein sequences to a FASTA-format file
  3. Reading a nucleotide sequence from a GenBank-format file

Example 1: Reading protein sequences from a FASTA-format file

    // Open the FASTA file for buffered character reading.
    File seqFile = new File("test_seqs.fasta");

    BufferedReader reader = new BufferedReader(new FileReader(seqFile));

    // The format object is given a ProteinFactory so that it can build Protein objects.
    FASTA<Protein> fastaObj = new FASTA<>(new ProteinFactory());

    BufferedSeqReader<Protein> seqReader = new BufferedSeqReader<Protein>(reader, fastaObj);

    // Declared outside the try block so the sequences remain available afterwards.
    List<Protein> seqs;
    try
    {
       seqs = seqReader.readAll();
    }
    finally
    {
       // Always close the sequence reader, even if readAll() throws.
       seqReader.close();
    }

Example 2: Writing protein sequences to a FASTA-format file

    FASTA<Protein> fastaObj = new FASTA<>();

    // try-with-resources closes the writer even if one of the writes throws.
    try (FileWriter writer = new FileWriter("write_seqs_test.fasta"))
    {
       // Write each protein to the same output file, one record after another.
       for (Protein protein : seqs)
       {
          fastaObj.write(protein, writer);
       }
    }

Example 3: Reading a nucleotide sequence from a GenBank-format file

    // Open the GenBank file for buffered character reading.
    File seqFile = new File("test_seq.gb");

    BufferedReader reader = new BufferedReader(new FileReader(seqFile));

    // The format object is given a NucleicAcidFactory so that it can build NucleicAcid objects.
    GenBank<NucleicAcid> formatObj = new GenBank<>(new NucleicAcidFactory());

    BufferedSeqReader<NucleicAcid> seqReader = new BufferedSeqReader<>(reader, formatObj);

    // Pull the sequences one at a time via hasNext() / next() instead of readAll().
    List<NucleicAcid> seqs = new ArrayList<>();
    try
    {
       while (seqReader.hasNext())
       {
          seqs.add(seqReader.next());
       }
    }
    finally
    {
       // Always close the sequence reader, even if iteration throws.
       seqReader.close();
    }

    // Inspect the first sequence that was read from the file.
    NucleicAcid seq = seqs.get(0);

    // Check the values exposed by the sequence object: ID, molecule type, topology,
    // repository division, description, length, and taxon scientific name.
    Assert.assertEquals("R88064.1", seq.getID());
    Assert.assertEquals(MolType.mRNA, seq.getMolType());
    Assert.assertEquals(SeqTopology.LINEAR, seq.getSeqTopology());
    Assert.assertEquals(NCBIGenBankDivision.EST, seq.getSeqRepositoryDivision());
    Assert.assertEquals("ym87c11.r1 Soares adult brain N2b4HB55Y Homo sapiens cDNA clone IMAGE:165908 5', mRNA sequence", seq.getDescription());
    Assert.assertEquals(460, seq.length());
    Assert.assertEquals("Homo sapiens", seq.getNCBITaxon().getScientificName());

    // The test record is expected to carry exactly one reference.
    Assert.assertEquals(1, seq.getReferences().size());

    // Check the title, string form, and author list of that single reference.
    SeqCitation reference = seq.getReferences().get(0);
    Assert.assertEquals("The WashU-Merck EST Project", reference.getTitle());
    Assert.assertEquals("Unpublished", reference.toString());
    Assert.assertEquals(21, reference.getAuthors().size());
    Assert.assertEquals("Hillier,L., Clark,N., Dubuque,T., Elliston,K., Hawkins,M., Holman,M., Hultman,M., Kucaba,T., Le,M., Lennon,G., Marra,M., Parsons,J., Rifkin,L., Rohlfing,T., Soares,M., Tan,F., Trevaskis,E., Waterston,R., Williamson,A., Wohldmann,P., Wilson,R.", StringUtil.join(reference.getAuthors(), ", "));

    // Request the features registered under GenBankFeatureKey.CDS; the returned list
    // is not examined further in this example.
    List<SeqFeature> cdsFeatures = seq.getFeatures(GenBankFeatureKey.CDS);

Return to Main Page