#!/usr/bin/python
#------------------------------------------------------------------------------
# Gml2Pajek.py: IMDB/GML XML file data extractor
#
# by Vladimir Batagelj, April 2006
#
# typical usage:
#   import Gml2Pajek
#   Gml2Pajek.run('D:\\vlado\\work\\Python\\IMDB\\SAX\\','imdb.graphml')
#------------------------------------------------------------------------------
# based on
#   http://www.rexx.com/~dkuhlman/pyxmlfaq.html#howsax
#   http://www.rexx.com/~dkuhlman/pyxmlfaq.html#howsaxhandler
# GraphML: http://graphml.graphdrawing.org/
# IMDB/GraphML: http://www.ul.ie/gd2005/dataset.html
#   copy: http://vlado.fmf.uni-lj.si/pub/networks/data/GD/gd05/dataset.html
# IMDB org: http://www.imdb.com/
#------------------------------------------------------------------------------

import string
import sys
import time
import xml.sax
from xml.sax import handler

# The original module imported the names below (PyXML's SAX1 factory and the
# Python 2 string functions).  The code no longer needs them -- it uses the
# standard xml.sax factory and str methods -- but they stay importable from
# this module wherever the running interpreter still provides them.
try:
    from xml.sax import saxexts
except ImportError:
    pass
try:
    from string import strip, join, lower
except ImportError:
    pass

# time.clock() was removed in Python 3.8; use perf_counter() where it exists.
_timer = getattr(time, 'perf_counter', None) or time.clock


def _say(*parts):
    """Print *parts* separated by single spaces, like the Python 2 statement
    ``print a, b, c`` (works unchanged on Python 2 and 3)."""
    print(' '.join(['%s' % (p,) for p in parts]))


class GMLSaxDocumentHandler(handler.ContentHandler):
    """SAX handler that collects works, authors and links from IMDB/GraphML.

    The handler reads and fills the module-level tables created by run():

      wTab  -- node id -> [work number, type, year, node id]
      aTab  -- node id -> [author number, first year, last year, node id]
      links -- work number -> [year, author number, author number, ...]
      nam   -- open names file; one '<number> "<title>"' line per vertex
      types -- list of known type names (used for the <data key="k3"> value)

    <data> keys as used by this code: k0 = title, k1 = 'true' for a work
    (movie) node, k2 = year, k3 = type name.  Works and authors are numbered
    from 1 in order of appearance.
    """

    def __init__(self, netFile):
        handler.ContentHandler.__init__(self)
        self.net = netFile          # kept for callers; not written to here
        self.inContent = 0
        self.contentData = []       # character chunks of the current element
        self.node = ''              # id of the node being read ('' = none)
        self.test = 0               # set > 0 to trace every node and edge
        self.key = ''               # key attribute of the current <data>
        self.year = 0               # 0 = year unknown
        self.title = '*** no title'
        self.movie = 0
        self.type = 22              # default type code
        self.numRec = 0             # number of <node> and <edge> elements seen
        self.source = None          # endpoints of the most recent <edge>
        self.target = None

    def startDocument(self):
        _say("-------- Document Start --------")

    def endDocument(self):
        _say("-------- Document End ----------")
        _say(self.numRec, ' nodes and edges read')

    def startElement(self, name, attrs):
        """Record the id/key attribute of <node>/<data>; process an <edge>."""
        if name == 'node':
            self.numRec += 1
            nodeId = attrs.get('id')
            if nodeId is not None:
                self.node = nodeId
        elif name == 'data':
            dataKey = attrs.get('key')
            if dataKey is not None:
                self.key = dataKey
        elif name == 'edge':
            self.numRec += 1
            self._readEdge(attrs)

    def _readEdge(self, attrs):
        """Link the work named by 'source' to the author named by 'target'."""
        self.source = attrs.get('source')
        self.target = attrs.get('target')
        # Both endpoints are required.  Reading them afresh for every edge
        # keeps a value from an earlier edge from being reused silently.
        if self.source is None or self.target is None:
            _say("bad edge info", self.numRec)
            return

        work = wTab.get(self.source)
        if work is not None:
            src = work[0]
            yr = work[2]
        else:
            src = 0
            yr = 2100
        author = aTab.get(self.target)
        if author is not None:
            tgt = author[0]
        else:
            tgt = 0

        # Numbers start at 1, so a zero product means an unknown endpoint.
        if src * tgt > 0:
            if yr > 1850:
                # Widen the author's [first, last] year range by this work.
                aTab[self.target] = [tgt, min(author[1], yr),
                                     max(author[2], yr), self.target]
            links[src].append(tgt)
        else:
            _say("unknown nodes", self.numRec, self.source, self.target)
        if self.test > 0:
            _say('edge', self.source, src, self.target, tgt, yr)

    def endElement(self, name):
        """Store a finished <node>, or interpret a finished <data> value."""
        if name == 'node':
            self._endNode()
        elif name == 'data':
            self._endData()
        else:
            self.contentData = []

    def _endNode(self):
        """Enter the finished node into wTab or aTab and reset node state."""
        if self.test > 0:
            _say("node", self.numRec, self.movie, self.type, self.year,
                 self.node, self.title)
        if self.movie:
            if self.node != '':
                numWrk = len(wTab) + 1
                if self.node in wTab:
                    _say('duplicated work', self.numRec, self.node, self.title)
                    # Keep the duplicate under a label made unique by its
                    # own work number.
                    self.node = 'DUP-' + str(numWrk)
                wTab[self.node] = [numWrk, self.type, self.year, self.node]
                nam.write('%s \"%s\"\n' % (numWrk, self.title))
                links[numWrk] = [self.year]
            else:
                _say("incomplete data")
        elif self.node != '':
            if self.node in aTab:
                _say('duplicated author', self.numRec, self.node, self.title)
            else:
                numAut = len(aTab) + 1
                if self.year > 1850:
                    aTab[self.node] = [numAut, self.year, self.year, self.node]
                else:
                    # No usable year: start with an empty [2100, 0] range
                    # that the first linked work will replace.
                    aTab[self.node] = [numAut, 2100, 0, self.node]
                # Authors are numbered after the works read so far.
                nam.write('%s \"%s\"\n' % (len(wTab) + numAut, self.title))
        self.movie = 0
        self.type = 22
        self.node = ''
        self.year = 0
        self.title = '*** no title'
        self.contentData = []

    def _endData(self):
        """Store the text of the finished <data> element according to its key."""
        content = ''.join(self.contentData).strip()
        content = content.replace('"', "'*")
        if self.key == 'k0':
            title = content.encode('ascii', 'xmlcharrefreplace')
            if not isinstance(title, str):
                # Python 3: encode() returned bytes; keep the title as text.
                title = title.decode('ascii')
            self.title = title
        elif self.key == 'k1':
            self.movie = (content.lower() == 'true')
        elif self.key == 'k2':
            try:
                self.year = int(content)
            except ValueError:
                # A malformed year must not abort the whole conversion.
                self.year = 0
                _say("error year")
        elif self.key == 'k3':
            # NOTE(review): the code stored is list index + 1, while run()
            # writes the .clu legend with the plain index and the default
            # type is 22 -- this looks off by one; confirm before changing,
            # since it alters the produced data.
            try:
                code = types.index(content.lower()) + 1
            except ValueError:
                code = 99
                _say("error type")
            self.type = code
        self.contentData = []

    def characters(self, chars, offset=0, length=None):
        """Collect character data.

        Accepts both the one-argument call made by the standard xml.sax
        parsers and the older (chars, offset, length) call.
        """
        if length is None:
            self.contentData.append(chars[offset:])
        else:
            self.contentData.append(chars[offset:offset + length])


def run(workDir, inFileName):
    """Convert the GraphML file workDir+inFileName into Pajek files.

    workDir is used as a plain string prefix, so it must end with a path
    separator.  Four files named after the input (extension replaced) are
    written next to it: .net (network), .nam (names), .vec (years of the
    works) and .clu (types).  Progress is printed to standard output.
    """
    global types, links, wTab, aTab, nam
    types = [
        'actor', 'action', 'adult', 'adventure', 'animation', 'comedy',
        'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
        'horror', 'music', 'musical', 'mystery', 'romance', 'sci-fi',
        'short', 'thriller', 'war', 'western', 'unknown']
    _say('GML to Pajek Converter')
    _say('by Vladimir Batagelj, April 2006')
    sys.path.append(workDir)

    # Strip the extension; a name without a dot is used unchanged.
    dot = inFileName.rfind('.')
    if dot >= 0:
        name = inFileName[:dot]
    else:
        name = inFileName
    base = workDir + name

    opened = []

    def openTracked(path, mode):
        # Remember every file so that all of them are closed on any exit.
        f = open(path, mode)
        opened.append(f)
        return f

    try:
        net = openTracked(base + '.net', 'w')
        _say('FILE=' + workDir + inFileName)
        _say('DATE=%s\n' % time.ctime(time.time()))
        nam = openTracked(base + '.nam', 'w')
        _say('namesFile = ', base + '.nam')
        nam.write('*vertices\n')
        vec = openTracked(base + '.vec', 'w')
        _say('yearFile = ', base + '.vec')
        clu = openTracked(base + '.clu', 'w')
        _say('typeFile = ', base + '.clu')

        docHandler = GMLSaxDocumentHandler(net)
        parser = xml.sax.make_parser()
        parser.setContentHandler(docHandler)
        # Binary mode: the XML parser decodes the bytes itself.
        inFile = openTracked(workDir + inFileName, 'rb')

        links = {}
        wTab = {}
        aTab = {}
        start = _timer()
        parser.parse(inFile)

        # export network
        numWrk = len(wTab)
        numAut = len(aTab)
        numAll = numWrk + numAut
        _say('# of works=%s, authors=%s\n' % (numWrk, numAut))
        for i, typeName in enumerate(types):
            clu.write('%% %2i - %s\n' % (i, typeName))
        clu.write('% 99 - other\n')
        clu.write('*vertices %s \n' % numAll)
        net.write('% GML2Pajek converter: GML -> Pajek\n')
        net.write('% by Vladimir Batagelj, April 2006\n')
        net.write('% GML File = ' + workDir + inFileName + '\n% ')
        net.write('%s\n' % time.ctime(time.time()))
        net.write('*vertices %s %s\n' % (numAll, numWrk))
        vec.write('*vertices %s \n' % numWrk)

        # Works first, in number order, then authors numbered after them.
        for t in sorted(wTab.values(), key=lambda rec: rec[0]):
            if t[2] > 0:
                net.write('%6i \"%s\" [%i]\n' % (t[0], t[3], t[2]))
            else:
                net.write('%6i \"%s\" [1990-*]\n' % (t[0], t[3]))
            clu.write('%s\n' % t[1])
            vec.write('%s\n' % t[2])
        for t in sorted(aTab.values(), key=lambda rec: rec[0]):
            net.write('%6i \"%s\" [%i-%i]\n'
                      % (numWrk + t[0], t[3], t[1], t[2]))
            clu.write('%s\n' % 0)

        net.write('*Arcs\n')
        for w in sorted(links.keys()):
            t = links[w]
            # t[0] is the work's year; the rest are author numbers.
            for a in t[1:]:
                net.write('%5i %5i 1 [%4i]\n' % (w, numWrk + a, t[0]))
        _say('elapsed time =', _timer() - start, 'seconds')
    finally:
        for f in opened:
            f.close()