#!/usr/bin/python
#------------------------------------------------------------------------------
# DblpSaxDH.py: DBLP XML file data extractor
#
# by Vladimir Batagelj, April 2006
#
# typical usage:
#   import DblpSaxDH
#   DblpSaxDH.run('D:\\vlado\\docs\\Papers\\2006\\SS05apr06\\DBLP\\','dblpT.xml')
#
# output (<name>.out, written next to the input file):
#   three header lines (program, FILE=, DATE=), then
#   S                   start of document
#   [                   start of a publication record
#   K/Y/T/W <value>     key, year, title, record type (1-based index in pubs)
#   A <author>          one line per author
#   I                   written instead of K/Y/T/W/A when a record has no authors
#   ]                   end of the publication record
#   E                   end of document
#------------------------------------------------------------------------------
# based on
#   http://www.rexx.com/~dkuhlman/pyxmlfaq.html#howsax
#   http://www.rexx.com/~dkuhlman/pyxmlfaq.html#howsaxhandler
#------------------------------------------------------------------------------

import os
import string
import sys
import time
import xml.sax
from xml.sax import handler

try:
    # Only shipped with the old PyXML package; no longer needed by this module
    # but the name is kept for any code that still refers to it.
    from xml.sax import saxexts
except ImportError:
    saxexts = None

try:
    from string import strip, join
except ImportError:
    # These helpers were removed from the string module in Python 3;
    # provide equivalents so the names stay available.
    def strip(s, chars=None):
        return s.strip(chars)

    def join(words, sep=' '):
        return sep.join(words)


# Element names treated as publication records.  The 1-based position in this
# list is the record type written on the 'W' line.
pubs = ['article', 'inproceedings', 'proceedings', 'book',
        'incollection', 'phdthesis', 'mastersthesis', 'www']

# Child elements whose text is collected for the current record.
_FIELDS = ('author', 'title', 'year')

# time.clock is not available on newer interpreters; prefer perf_counter.
_timer = getattr(time, 'perf_counter', None)
if _timer is None:
    _timer = time.clock


def _pub_type(name):
    """Return the 1-based index of *name* in pubs, or 0 if it is not listed."""
    try:
        return pubs.index(name) + 1
    except ValueError:
        return 0


def _to_ascii(text):
    """Return *text* with every non-ASCII character replaced by '?'."""
    return text.encode('ascii', 'replace').decode('ascii')


class DblpSaxDocumentHandler(handler.ContentHandler):
    """SAX handler that writes one bracketed block per publication record.

    outfile is any object with a write(str) method.  The handler accepts both
    the SAX2 callback characters(content) and the older three-argument form
    characters(chars, offset, length).
    """

    def __init__(self, outfile):
        handler.ContentHandler.__init__(self)
        self.outfile = outfile
        self.inContent = 0      # not used by the handler itself
        self.contentData = []   # text chunks of the element being read
        self.key = ''
        self.year = ''
        self.title = ''
        self.authors = []
        self.type = 0           # 1-based index into pubs, 0 = none
        self.num = 0            # number of publication records started
        self._field = None      # author/title/year element currently open
        self._nested = 0        # depth of markup opened inside that field

    def _reset_record(self):
        """Forget everything collected for the record just finished."""
        self.key = self.year = self.title = ''
        self.authors = []
        self.type = 0

    def _collected_text(self, escape_quotes):
        """Return the stripped, ASCII-only text gathered for a field."""
        text = ''.join(self.contentData).strip()
        if escape_quotes:
            text = text.replace('"', "'*")
        return _to_ascii(text)

    def startDocument(self):
        print("-------- Document Start --------")
        self.outfile.write("S\n")

    def endDocument(self):
        print("-------- Document End ----------")
        print('%s  records read' % self.num)
        self.outfile.write("E\n")

    def startElement(self, name, attrs):
        kind = _pub_type(name)
        if kind > 0:
            self.outfile.write('[\n')
            self.type = kind
            self.num += 1
        # As before, a 'key' attribute on any element sets the current key.
        key = attrs.get('key')
        if key is not None:
            self.key = _to_ascii(key)
        if self._field is not None:
            # Markup such as <i> or <sub> inside a field: remember the depth
            # so its end tag does not discard the text collected so far.
            self._nested += 1
        elif name in _FIELDS:
            self._field = name
            self._nested = 0
            self.contentData = []

    def endElement(self, name):
        if self._nested > 0:
            # End of markup nested inside a field; keep accumulating text.
            self._nested -= 1
            return
        self._field = None
        if _pub_type(name) > 0:
            if self.authors:
                self.outfile.write('K %s\nY %s\nT %s\nW %s\n' %
                                   (self.key, self.year, self.title, self.type))
                for author in self.authors:
                    self.outfile.write('A %s\n' % author)
            else:
                self.outfile.write("I\n")
            self.outfile.write(']\n')
            # Reset in both cases so data from a record without authors
            # cannot show up in a later record.
            self._reset_record()
        elif name == 'author':
            self.authors.append(self._collected_text(True))
        elif name == 'title':
            self.title = self._collected_text(True)
        elif name == 'year':
            self.year = self._collected_text(False)
        self.contentData = []

    def characters(self, chars, offset=0, length=None):
        if length is None:
            self.contentData.append(chars[offset:])
        else:
            self.contentData.append(chars[offset:offset + length])


def run(workDir, inFileName):
    """Extract publication records from workDir+inFileName.

    workDir is concatenated directly with the file names, so it has to end
    with a path separator (or be empty).  The result is written to a file
    with the same base name and the extension '.out' in the same directory.

    Any exception raised while opening the files or parsing is propagated;
    both files are closed in every case.

    NOTE(review): DBLP files usually refer to an external DTD for their
    character entities; whether the parser loads it depends on the parser's
    external-entity setting - confirm for the Python version in use.
    """
    print('DBLP Sax Document Extractor')
    print('by Vladimir Batagelj, April 2006')
    # Retained from the original; nothing in the visible code imports from
    # workDir.  Guarded so repeated calls do not grow sys.path.
    if workDir not in sys.path:
        sys.path.append(workDir)
    # splitext leaves names without an extension intact
    # (slicing at rfind('.') == -1 would cut off the last character).
    name = os.path.splitext(inFileName)[0]
    outFile = open(workDir + name + '.out', 'w')
    try:
        outFile.write('DBLP Sax Document Extractor, by Vladimir Batagelj, April 2006\n')
        outFile.write('FILE=' + workDir + inFileName + '\n')
        outFile.write('DATE=%s\n' % time.ctime(time.time()))
        # Create an instance of the Handler.
        docHandler = DblpSaxDocumentHandler(outFile)
        # Create an instance of the parser (standard library SAX2 parser).
        parser = xml.sax.make_parser()
        # Set the document handler.
        parser.setContentHandler(docHandler)
        # Binary mode: the parser decodes according to the XML declaration.
        inFile = open(workDir + inFileName, 'rb')
        try:
            # Start the parse.
            start = _timer()
            parser.parse(inFile)
            print('elapsed time = %s seconds' % (_timer() - start))
        finally:
            inFile.close()
    finally:
        outFile.close()