jbei-seq format (.xml) sequence files

jbei-seq format sequence files:

jbei-seq format sequence files generally contain a single sequence each (although at times, you may see multiple translated ORF sequences embedded as feature annotations within a long DNA sequence). Feature annotations within jbei-seq format files are extremely useful for viewing a DNA sequence at a higher, more functional level, and they allow you to rapidly check whether a designed DNA assembly process will result in the desired sequence.

Currently, the "seq:label" field for a given feature annotation is used to determine if two features (with the same name/label) should be spliced at a DNA assembly junction.

The jbei-seq format is based upon XML. Unlike Genbank format files, jbei-seq format files have no required whitespace character counts for formatting purposes, and they are much easier for a computer program to parse.

Here is a stylized example jbei-seq format file:

<?xml version="1.0" encoding="UTF-8"?>
<!--
    Stylized jbei-seq example: a linear 63 bp sequence carrying one CDS feature.
    Child elements appear in the order required by the xs:sequence content
    models of the jbei-seq schema (name, circular, sequence, features; and,
    within a feature: label, complement, type, location, attribute).
-->
<seq:seq
    xmlns:seq="http://jbei.org/sequence"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://jbei.org/sequence seq.xsd">

<seq:name>signal_pep</seq:name>
<seq:circular>false</seq:circular>
<seq:sequence>GGCAGCAAGGTCTACGGCAAGGAACAGTTTTTGCGGATGCGCCAGAGCATGTTCCCCGATCGC</seq:sequence>
<seq:features>
    <seq:feature>
        <!-- The label is what is compared across features when deciding whether to splice them at an assembly junction. -->
        <seq:label>signal_peptide</seq:label>
        <seq:complement>false</seq:complement>
        <seq:type>CDS</seq:type>
        <seq:location>
            <seq:genbankStart>1</seq:genbankStart>
            <seq:end>63</seq:end>
        </seq:location>
        <seq:attribute name="translation">GSKVYGKEQFLRMRQSMFPDR</seq:attribute>
        <!-- The schema also requires a seq:seqHash element at this position; it is left out of this stylized example. -->
    </seq:feature>
</seq:features>
</seq:seq>

Here is the actual jbei-seq format sequence file: signal_peptide.xml

For the sake of comparison, here is the equivalent Genbank format file:

LOCUS       signal_pep                63 bp ds-DNA     linear       26-MAR-2011
ACCESSION   
VERSION     
KEYWORDS    .
FEATURES             Location/Qualifiers
     CDS             1..63
                     /label="signal_peptide"
                     /translation="GSKVYGKEQFLRMRQSMFPDR"
ORIGIN      
        1 ggcagcaagg tctacggcaa ggaacagttt ttgcggatgc gccagagcat gttccccgat
       61 cgc     
//

Here is the actual Genbank format file: signal_peptide.gb

Here is the stylized schema for the jbei-seq sequence format:

<?xml version="1.0" encoding="UTF-8"?>
<!--
    Stylized XML Schema for the jbei-seq sequence format.
    Every element is declared globally in the http://jbei.org/sequence
    namespace and referenced from its parent's content model.
-->
<xs:schema
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:seq="http://jbei.org/sequence"
    targetNamespace="http://jbei.org/sequence"
    elementFormDefault="qualified">

  <!-- ===== Document root ===== -->

  <!-- seq: name, circular flag and residues, optionally followed by a features list. -->
  <xs:element name="seq">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="seq:name"/>
        <xs:element ref="seq:circular"/>
        <xs:element ref="seq:sequence"/>
        <xs:element ref="seq:features" minOccurs="0"/>
      </xs:sequence>
      <!-- Optional format version string, e.g. "2011.08.0" (year.month.revision). -->
      <xs:attribute name="schemaVersion" type="xs:string"/>
    </xs:complexType>
  </xs:element>

  <!-- ===== Direct children of seq ===== -->

  <xs:element name="name" type="xs:string"/>
  <xs:element name="circular" type="xs:boolean"/>
  <xs:element name="sequence" type="xs:string"/>

  <!-- features: wrapper that, when present, holds one or more feature elements. -->
  <xs:element name="features">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="seq:feature" maxOccurs="unbounded"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <!-- ===== Feature annotation ===== -->

  <!--
      feature: children must appear in exactly this order. At least one
      location and at least one attribute are required; seqHash is required.
  -->
  <xs:element name="feature">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="seq:label"/>
        <xs:element ref="seq:complement"/>
        <xs:element ref="seq:type"/>
        <xs:element ref="seq:location" maxOccurs="unbounded"/>
        <xs:element ref="seq:attribute" maxOccurs="unbounded"/>
        <xs:element ref="seq:seqHash"/>
      </xs:sequence>
    </xs:complexType>
  </xs:element>

  <xs:element name="label" type="xs:string"/>
  <xs:element name="complement" type="xs:boolean"/>

  <!-- type: closed vocabulary of feature kinds (the values appear to mirror GenBank feature keys; confirm against the feature table definition). -->
  <xs:element name="type">
    <xs:simpleType>
      <xs:restriction base="xs:string">
        <xs:enumeration value="allele"/>
        <xs:enumeration value="attenuator"/>
        <xs:enumeration value="C_region"/>
        <xs:enumeration value="CAAT_signal"/>
        <xs:enumeration value="CDS"/>
        <xs:enumeration value="conflict"/>
        <xs:enumeration value="D-loop"/>
        <xs:enumeration value="D_segment"/>
        <xs:enumeration value="enhancer"/>
        <xs:enumeration value="exon"/>
        <xs:enumeration value="gene"/>
        <xs:enumeration value="GC_signal"/>
        <xs:enumeration value="iDNA"/>
        <xs:enumeration value="intron"/>
        <xs:enumeration value="J_region"/>
        <xs:enumeration value="LTR"/>
        <xs:enumeration value="mat_peptide"/>
        <xs:enumeration value="misc_binding"/>
        <xs:enumeration value="misc_difference"/>
        <xs:enumeration value="misc_feature"/>
        <xs:enumeration value="misc_recomb"/>
        <xs:enumeration value="misc_RNA"/>
        <xs:enumeration value="misc_signal"/>
        <xs:enumeration value="misc_structure"/>
        <xs:enumeration value="modified_base"/>
        <xs:enumeration value="mRNA"/>
        <xs:enumeration value="mutation"/>
        <xs:enumeration value="N_region"/>
        <xs:enumeration value="old_sequence"/>
        <xs:enumeration value="polyA_signal"/>
        <xs:enumeration value="polyA_site"/>
        <xs:enumeration value="precursor_RNA"/>
        <xs:enumeration value="prim_transcript"/>
        <xs:enumeration value="primer"/>
        <xs:enumeration value="primer_bind"/>
        <xs:enumeration value="promoter"/>
        <xs:enumeration value="protein_bind"/>
        <xs:enumeration value="RBS"/>
        <xs:enumeration value="rep_origin"/>
        <xs:enumeration value="repeat_region"/>
        <xs:enumeration value="repeat_unit"/>
        <xs:enumeration value="rRNA"/>
        <xs:enumeration value="S_region"/>
        <xs:enumeration value="satellite"/>
        <xs:enumeration value="scRNA"/>
        <xs:enumeration value="sig_peptide"/>
        <xs:enumeration value="snRNA"/>
        <xs:enumeration value="source"/>
        <xs:enumeration value="stem_loop"/>
        <xs:enumeration value="STS"/>
        <xs:enumeration value="TATA_signal"/>
        <xs:enumeration value="terminator"/>
        <xs:enumeration value="transit_peptide"/>
        <xs:enumeration value="transposon"/>
        <xs:enumeration value="tRNA"/>
        <xs:enumeration value="unsure"/>
        <xs:enumeration value="V_region"/>
        <xs:enumeration value="variation"/>
        <xs:enumeration value="-10_signal"/>
        <xs:enumeration value="-35_signal"/>
        <xs:enumeration value="3'clip"/>
        <xs:enumeration value="3'UTR"/>
        <xs:enumeration value="5'clip"/>
        <xs:enumeration value="5'UTR"/>
      </xs:restriction>
    </xs:simpleType>
  </xs:element>

  <!-- location: an integer start/end pair plus two optional boolean flags. -->
  <xs:element name="location">
    <xs:complexType>
      <xs:sequence>
        <xs:element ref="seq:genbankStart"/>
        <xs:element ref="seq:end"/>
      </xs:sequence>
      <xs:attribute name="siteInbetween" type="xs:boolean"/>
      <xs:attribute name="singleResidue" type="xs:boolean"/>
    </xs:complexType>
  </xs:element>

  <xs:element name="genbankStart" type="xs:integer"/>
  <xs:element name="end" type="xs:integer"/>

  <!-- attribute: free-text value keyed by a required "name"; "quoted" is an optional boolean flag. -->
  <xs:element name="attribute">
    <xs:complexType>
      <xs:simpleContent>
        <xs:extension base="xs:string">
          <xs:attribute name="name" type="xs:string" use="required"/>
          <xs:attribute name="quoted" type="xs:boolean"/>
        </xs:extension>
      </xs:simpleContent>
    </xs:complexType>
  </xs:element>

  <xs:element name="seqHash" type="xs:string"/>

</xs:schema>

Here is the actual schema for the jbei-seq format sequence file: seq.xsd