XML parsing per record

Willem Ligtenberg WLigtenberg at gmail.com
Wed Apr 20 06:08:36 EDT 2005


On Sun, 17 Apr 2005 02:16:04 +0000, William Park wrote:

> Willem Ligtenberg <WLigtenberg at gmail.com> wrote:
>> I want to parse a very large (2.4 gig) XML file (bioinformatics
>> of course :)). But I have no clue how to do that. Most things I see read
>> the entire xml file at once. That isn't going to work here of course.
>> 
>> So I would like to parse an XML file one record at a time and then be
>> able to store the information in another object.  How should I do
>> that?
>> 
>> Thanks in advance,
>> 
>> Willem Ligtenberg A total newbie to python by the way.
> 
> You may want to try Expat (www.libexpat.org) or Python wrapper to it.
> You can feed small piece at a time, say by lines or whatever.  Of
> course, it all depends on what kind of parsing you have in mind. :-)
> 
> Care to post more details?

The XML file I need to parse contains information about genes.
So the first element is a gene and then there are a lot of sub-elements with
sub-elements. I only need some of the information and want to store it in
an object called gene. Later on this information will be printed into a
file, which in its turn will be fed into some other program.
This is an example of the XML:
<?xml version="1.0"?>
<!DOCTYPE Entrezgene-Set PUBLIC "-//NCBI//NCBI Entrezgene/EN" "NCBI_Entrezgene.dtd">
<Entrezgene-Set>
  <Entrezgene>
    <Entrezgene_track-info>
      <Gene-track>
        <Gene-track_geneid>9996</Gene-track_geneid>
        <Gene-track_status value="secondary">1</Gene-track_status>
        <Gene-track_current-id>
          <Dbtag>
            <Dbtag_db>LocusID</Dbtag_db>
            <Dbtag_tag>
              <Object-id>
                <Object-id_id>320632</Object-id_id>
              </Object-id>
            </Dbtag_tag>
          </Dbtag>
          <Dbtag>
            <Dbtag_db>GeneID</Dbtag_db>
            <Dbtag_tag>
              <Object-id>
                <Object-id_id>320632</Object-id_id>
              </Object-id>
            </Dbtag_tag>
          </Dbtag>
        </Gene-track_current-id>
        <Gene-track_create-date>
          <Date>
            <Date_std>
              <Date-std>
                <Date-std_year>2003</Date-std_year>
                <Date-std_month>8</Date-std_month>
                <Date-std_day>28</Date-std_day>
                <Date-std_hour>21</Date-std_hour>
                <Date-std_minute>39</Date-std_minute>
                <Date-std_second>0</Date-std_second>
              </Date-std>
            </Date_std>
          </Date>
        </Gene-track_create-date>
        <Gene-track_update-date>
          <Date>
            <Date_std>
              <Date-std>
                <Date-std_year>2005</Date-std_year>
                <Date-std_month>2</Date-std_month>
                <Date-std_day>17</Date-std_day>
                <Date-std_hour>12</Date-std_hour>
                <Date-std_minute>54</Date-std_minute>
                <Date-std_second>0</Date-std_second>
              </Date-std>
            </Date_std>
          </Date>
        </Gene-track_update-date>
      </Gene-track>
    </Entrezgene_track-info>
    <Entrezgene_type value="protein-coding">6</Entrezgene_type>
    <Entrezgene_source>
      <BioSource>
        <BioSource_genome value="genomic">1</BioSource_genome>
        <BioSource_origin value="natural">1</BioSource_origin>
        <BioSource_org>
          <Org-ref>
            <Org-ref_taxname>Mus musculus</Org-ref_taxname>
            <Org-ref_common>house mouse</Org-ref_common>
            <Org-ref_db>
              <Dbtag>
                <Dbtag_db>taxon</Dbtag_db>
                <Dbtag_tag>
                  <Object-id>
                    <Object-id_id>10090</Object-id_id>
                  </Object-id>
                </Dbtag_tag>
              </Dbtag>
            </Org-ref_db>
            <Org-ref_syn>
              <Org-ref_syn_E>mouse</Org-ref_syn_E>
            </Org-ref_syn>
            <Org-ref_orgname>
              <OrgName>
                <OrgName_name>
                  <OrgName_name_binomial>
                    <BinomialOrgName>
                      <BinomialOrgName_genus>Mus</BinomialOrgName_genus>
                      <BinomialOrgName_species>musculus</BinomialOrgName_species>
                    </BinomialOrgName>
                  </OrgName_name_binomial>
                </OrgName_name>
                <OrgName_lineage>Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Euarchontoglires; Glires; Rodentia; Sciurognathi; Muridae; Murinae; Mus</OrgName_lineage>
                <OrgName_gcode>1</OrgName_gcode>
                <OrgName_mgcode>2</OrgName_mgcode>
                <OrgName_div>ROD</OrgName_div>
              </OrgName>
            </Org-ref_orgname>
          </Org-ref>
        </BioSource_org>
      </BioSource>
    </Entrezgene_source>
    <Entrezgene_gene>
      <Gene-ref>
      </Gene-ref>
    </Entrezgene_gene>
    <Entrezgene_gene-source>
      <Gene-source>
        <Gene-source_src>LocusLink</Gene-source_src>
        <Gene-source_src-int>9996</Gene-source_src-int>
        <Gene-source_src-str2>9996</Gene-source_src-str2>
        <Gene-source_gene-display value="false"/>
        <Gene-source_locus-display value="false"/>
        <Gene-source_extra-terms value="false"/>
      </Gene-source>
    </Entrezgene_gene-source>
    <Entrezgene_locus>
      <Gene-commentary>
        <Gene-commentary_type value="genomic">1</Gene-commentary_type>
        <Gene-commentary_version>0</Gene-commentary_version>
      </Gene-commentary>
    </Entrezgene_locus>
    <Entrezgene_unique-keys>
      <Dbtag>
        <Dbtag_db>LocusID</Dbtag_db>
        <Dbtag_tag>
          <Object-id>
            <Object-id_id>9996</Object-id_id>
          </Object-id>
        </Dbtag_tag>
      </Dbtag>
    </Entrezgene_unique-keys>
    <Entrezgene_xtra-index-terms>
      <Entrezgene_xtra-index-terms_E>LOC320632</Entrezgene_xtra-index-terms_E>
    </Entrezgene_xtra-index-terms>
  </Entrezgene>
</Entrezgene-Set>




More information about the Python-list mailing list