python获取DBLP数据集

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import xml.sax
import io, sys

# Not referenced by the code visible in this file; presumably the
# record-level element names of a DBLP dump -- TODO confirm.
paper_tags = (
    'article',
    'inproceedings',
    'proceedings',
    'book',
    'incollection',
    'phdthesis',
    'mastersthesis',
    'www',
)

# Also unreferenced in the visible code; presumably the venue-like child
# elements of a record -- TODO confirm.
sub_tags = (
    'publisher',
    'journal',
    'booktitle',
)

# Shared row buffer: DBLPHandler.record_row appends one text row per article
# and DBLPHandler.write_to_file writes the rows out.
ret = list()

class DBLPHandler(xml.sax.ContentHandler):
    """SAX handler that flattens every ``<article>`` element into a text row.

    While an article is being parsed, the text of each child element listed
    in ``FIELD_TAGS`` is stored on the attribute of the same name.  When the
    article closes, ``record_row`` appends one row to the module-level list
    ``ret``; ``write_to_file`` later appends those rows to a file.

    If an element occurs several times inside one article (for example
    several ``<author>`` elements), only the last occurrence is kept.
    """

    # Child elements whose text is captured; each maps to the attribute of
    # the same name.
    FIELD_TAGS = ('author', 'title', 'pages', 'year', 'volume', 'journal',
                  'number', 'url', 'ee')

    def __init__(self):
        # Call the base initializer explicitly (rather than via super()) so
        # the class also works where ContentHandler is an old-style class.
        xml.sax.ContentHandler.__init__(self)
        self.id = 1
        self.reset()

    def reset(self):
        """Clear all per-article state."""
        self.dup_article = 0
        self.curtag = None
        # Name of the field element currently being captured, and the text
        # chunks received for it so far.
        self._field = None
        self._chunks = []
        self.author = ''
        self.title = ''
        self.pages = ''
        self.year = ''
        self.volume = ''
        self.journal = ''
        self.number = ''
        self.url = ''
        self.ee = ''

    def write_to_file(self, filename):
        """Append every row collected in ``ret`` to *filename* as UTF-8."""
        # io.open behaves the same on Python 2 and 3, and the context
        # manager closes the file even if a write fails.
        with io.open(filename, 'a', encoding='utf-8') as file_object:
            file_object.writelines(ret)

    def record_row(self):
        """Append the current article to ``ret`` as one newline-ended row.

        The row is author, title, year, pages, journal and ee concatenated
        without separators and with every space character removed.
        """
        fields = (self.author, self.title, self.year, self.pages,
                  self.journal, self.ee)
        ret.append(u''.join(fields).replace(' ', '') + u'\n')

    def startElement(self, tag, attributes):
        """Track the opened element and start capturing text for fields."""
        if not tag or not tag.strip():
            return
        if tag == 'article':
            # Start from a clean slate so values left behind by preceding
            # non-article records cannot leak into this article's row.
            self.reset()
            self.dup_article += 1
        self.curtag = tag
        if tag in self.FIELD_TAGS:
            self._field = tag
            self._chunks = []

    def endElement(self, tag):
        """Store a finished field, or emit the row when an article closes."""
        if not tag or not tag.strip():
            return
        if tag == self._field:
            setattr(self, tag, u''.join(self._chunks).strip())
            self._field = None
            self._chunks = []
        elif tag == 'article':
            self.record_row()
            self.reset()

    def characters(self, content):
        """Collect text belonging to the field element currently open."""
        # A parser may deliver one element's text in several calls, so the
        # pieces are accumulated here and joined in endElement.  Text that
        # arrives between elements (no field open) is ignored.
        if self._field is not None:
            self._chunks.append(content)

if __name__ == "__main__":
    # Parse 'dblp.xml' from the working directory unless exactly one
    # command-line argument names another input file.
    filename = 'dblp.xml'
    if len(sys.argv) == 2:
        filename = sys.argv[1]

    # Create an XMLReader.
    parser = xml.sax.make_parser()
    # Turn off namespace processing.
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    # Install the custom ContentHandler.
    Handler = DBLPHandler()
    parser.setContentHandler(Handler)

    parser.parse(filename)
    # Parenthesised form is valid on both Python 2 and Python 3.
    print('Parser Complete!')
    # Append the collected rows to the file 'out'.
    Handler.write_to_file('out')

另外附处理DNA数据的脚本程序:

# Truncated line length for each output file, indexed by the file id that
# DNA_Handler.read_file counts from 1; slot 0 is therefore never read.
# Ids 1-3 go to the DNA_N* files, ids 4-8 to the DNA_M* files.
lens_DNA = [
    0,
    1000, 2000, 2500,
    500, 1000, 1500, 2000, 2500,
]

# Same layout for DBLP_Handler.read_file (files DBLP_1 .. DBLP_8).
lens_DBLP = [
    0,
    40, 120, 200,
    40, 80, 120, 160, 200,
]

# Module-level value; both read_file methods use their own local file_id.
file_id = 1

# Number of lines written under one file id before moving on to the next id.
LINE_MAX = 100

class DNA_Handler:
    """Cut a text file into fixed-length lines spread over several files.

    Output files are created in the current working directory: file ids
    1-3 are written to ``DNA_N<id>`` and higher ids to ``DNA_M<id - 3>``.
    """

    def __init__(self):
        # Buffer holding the line currently being assembled.
        self.strn = ''

    def write_to_file(self, filename):
        """Append the current buffer ``self.strn`` to *filename*."""
        with open(filename, 'a+') as file_object:
            file_object.write(self.strn)

    def read_file(self, filename, lengths=None, line_max=None):
        """Read *filename* and write truncated lines to the output files.

        Input lines are concatenated (newlines removed) until the buffer is
        longer than ``lengths[file_id]``.  The buffer is then cut to exactly
        that length, printed, appended to the output file for the current
        file id, and cleared; the characters beyond the cut are discarded.
        After ``line_max`` lines have been written for one id, the next id
        is used.  Reading stops at end of input or once every id in
        ``lengths`` has been served.

        Args:
            filename: Path of the input text file.
            lengths: Line length per file id; index 0 is unused because ids
                start at 1.  Defaults to the module-level ``lens_DNA``.
            line_max: Lines to write per file id.  Defaults to the
                module-level ``LINE_MAX``.
        """
        if lengths is None:
            lengths = lens_DNA
        if line_max is None:
            line_max = LINE_MAX

        self.strn = ''
        file_id = 1
        cnt_lines = 0
        with open(filename, 'r') as fo:
            for line in fo:
                if file_id >= len(lengths):
                    break
                self.strn += line.replace('\n', '')
                limit = lengths[file_id]
                if len(self.strn) > limit:
                    self.strn = self.strn[:limit] + '\n'
                    print(self.strn)
                    if file_id <= 3:
                        self.write_to_file('DNA_N' + str(file_id))
                    else:
                        self.write_to_file('DNA_M' + str(file_id - 3))
                    self.strn = ''
                    cnt_lines += 1
                    if cnt_lines >= line_max:
                        file_id += 1
                        cnt_lines = 0
        print('read_finished!')



class DBLP_Handler:
    """Cut a text file into fixed-length lines spread over several files.

    Output files are created in the current working directory and are
    named ``DBLP_<id>``, where ``<id>`` is the file id counted from 1.
    """

    def __init__(self):
        # Buffer holding the line currently being assembled.
        self.strn = ''

    def write_to_file(self, filename):
        """Append the current buffer ``self.strn`` to *filename*."""
        with open(filename, 'a+') as file_object:
            file_object.write(self.strn)

    def read_file(self, filename, lengths=None, line_max=None):
        """Read *filename* and write truncated lines to the output files.

        Input lines are concatenated (newlines removed) until the buffer is
        longer than ``lengths[file_id]``.  The buffer is then cut to exactly
        that length, printed, appended to ``DBLP_<file_id>``, and cleared;
        the characters beyond the cut are discarded.  After ``line_max``
        lines have been written for one id, the next id is used.  Reading
        stops at end of input or once every id in ``lengths`` has been
        served.

        Args:
            filename: Path of the input text file.
            lengths: Line length per file id; index 0 is unused because ids
                start at 1.  Defaults to the module-level ``lens_DBLP``.
            line_max: Lines to write per file id.  Defaults to the
                module-level ``LINE_MAX``.
        """
        if lengths is None:
            lengths = lens_DBLP
        if line_max is None:
            line_max = LINE_MAX

        self.strn = ''
        file_id = 1
        cnt_lines = 0
        with open(filename, 'r') as fo:
            for line in fo:
                if file_id >= len(lengths):
                    break
                self.strn += line.replace('\n', '')
                limit = lengths[file_id]
                if len(self.strn) > limit:
                    self.strn = self.strn[:limit] + '\n'
                    print(self.strn)
                    self.write_to_file('DBLP_' + str(file_id))
                    self.strn = ''
                    cnt_lines += 1
                    if cnt_lines >= line_max:
                        file_id += 1
                        cnt_lines = 0
        print('read_finished!')


if __name__ == '__main__':
    # Split 'human_dna.fa' (looked up in the working directory) into the
    # DNA_N* / DNA_M* output files.
    dna_handler = DNA_Handler()
    dna_handler.read_file('human_dna.fa')
    # The string literal below is disabled code for the DBLP variant; it is
    # evaluated as an expression and discarded, so it has no effect.
    '''
    bblp_h = DBLP_Handler()
    bblp_h.read_file('DBLP_data')
    '''