#!/usr/bin/python
#------------------------------------------------------------------------------
# Dblp2Pajek.py: DBLP XML data to Pajek Convertor
#
# by Vladimir Batagelj, April 2006
#
# typical usage:
#   import Dblp2Pajek
#   Dblp2Pajek.run('D:\\vlado\\docs\\Papers\\2006\\SS05apr06\\DBLP\\','dblpT.out')
#
# The code below is written to run unchanged on Python 2.6+ and Python 3.
#------------------------------------------------------------------------------

try:
    # Python 2 only: kept so these names stay importable from this module.
    from string import strip, split, replace, lower, index
except ImportError:
    # Python 3 removed the function forms; the code below uses str methods.
    pass
from sys import argv, exit
import re
import time

# Publication kinds; the 1-based position is the code shown in the .clu legend.
_PUBS = ['article', 'inproceedings', 'proceedings', 'book',
         'incollection', 'phdthesis', 'mastersthesis', 'www']

# First non-digit character of a year field (everything from there is cut).
_NON_DIGIT = re.compile(r'\D')

# time.clock was removed in Python 3.8; prefer perf_counter where it exists.
_timer = getattr(time, 'perf_counter', None) or time.clock


def _create(path, opened, label=None):
    """Open *path* for writing, register it in *opened*, optionally report it."""
    handle = open(path, 'w')
    opened.append(handle)
    if label is not None:
        print('%s %s' % (label, path))
    return handle


def _surname_key(row):
    """Return the lower-cased last word of the author name in row[3]."""
    name = row[3]
    # rfind() gives -1 for single-word names; +1 then keeps the whole name
    # instead of reducing it to its last character.
    return name[name.rfind(' ') + 1:].lower()


def _read_records(bib, nam):
    """Read the tagged input lines and build the work/author tables.

    The first character of each stripped line selects the action and the
    payload is everything from the third character on:
    '[' starts a record, 'K' key, 'Y' year, 'T' title, 'W' integer type code,
    'A' author, ']' ends a record, 'I'/'S' reset the record state and
    'E' stops reading.  Other lines (including blank ones) are ignored.

    A record is stored only if it had at least one 'A' line.  Each stored
    title is written to *nam* as it is seen.

    Returns (works, people, links, counts) where
      works  maps work key -> [vertex number, type, year, key],
      people maps author name -> [author number, first year, last year, name],
      links  maps work key -> list of author numbers,
      counts is (lines read, records, works stored, unparsable years).
    """
    works = {}
    people = {}
    links = {}
    num_lines = num_records = num_works = zero_years = 0
    # Initial state, so that input which does not open with an 'I'/'S' line
    # cannot hit an unassigned name.
    ok = False
    key = ''
    work = ''
    year = 0
    title = ''
    wtype = 0
    authors = []
    for raw in bib:
        num_lines += 1
        line = raw.strip()
        tag = line[:1]          # '' for a blank line, which matches nothing
        if tag == '[':
            num_records += 1
        elif tag == 'K':
            work = key = line[2:]
        elif tag == 'Y':
            text = line[2:]
            cut = _NON_DIGIT.search(text)
            if cut:
                text = text[:cut.start()]
            try:
                year = int(text)
            except ValueError:
                year = 0
                zero_years += 1
        elif tag == 'T':
            title = line[2:]
        elif tag == 'W':
            wtype = int(line[2:])
        elif tag == 'A':
            author = line[2:]
            ok = work != ''
            if ok:
                entry = people.get(author)
                if entry is None:
                    number = len(people) + 1
                    if year > 0:
                        people[author] = [number, year, year, author]
                    else:
                        # 2100/0 are min/max seeds until a real year shows up
                        people[author] = [number, 2100, 0, author]
                else:
                    number = entry[0]
                    if year > 0:
                        entry[1] = min(entry[1], year)
                        entry[2] = max(entry[2], year)
                authors.append(number)
        elif tag == ']' and ok:
            num_works += 1
            if work in works:
                print('duplicated key %s %s %s' % (num_records, key, title))
                work = 'DUP-' + str(num_works)
            works[work] = [num_works, wtype, year, work]
            nam.write('%s \"%s\"\n' % (num_works, title))
            links[work] = authors
        elif tag == 'E':
            break
        if tag in (']', 'I', 'S'):
            # NOTE(review): 'work' is deliberately left untouched here, as in
            # the original; a record without a 'K' line therefore reuses the
            # previous key and ends up stored as 'DUP-n'.  Confirm intent.
            ok = False
            key = ''
            year = 0
            title = ''
            wtype = 0
            authors = []
    return works, people, links, (num_lines, num_records, num_works, zero_years)


def _export(net, vec, clu, aut, eqv, works, people, links, source):
    """Write the network, year vector, type partition and author list."""
    num_works = len(works)
    num_all = num_works + len(people)
    net.write('% Dblp2Pajek converter (2nd pass): DBLP.out -> Pajek\n')
    net.write('% by Vladimir Batagelj, April 2006\n')
    net.write('% DBLP File = ' + source + '\n% ')
    net.write('%s\n' % time.ctime(time.time()))
    net.write('*vertices %s %s\n' % (num_all, num_works))
    vec.write('*vertices %s \n' % num_works)
    clu.write('% 0 - author\n')
    for code, kind in enumerate(_PUBS, 1):
        clu.write('% ')
        clu.write('%2i - %s\n' % (code, kind))
    clu.write('*vertices %s \n' % num_all)

    # Rows start with a unique number, so sorting orders them by that number.
    work_rows = sorted(works.values())
    for row in work_rows:
        if row[2] > 0:
            net.write('%6i \"%s\" [%i]\n' % (row[0], row[3], row[2]))
        else:
            net.write('%6i \"%s\" [1990-*]\n' % (row[0], row[3]))
        clu.write('%s\n' % row[1])
        vec.write('%s\n' % row[2])

    author_rows = sorted(people.values())
    for row in author_rows:
        net.write('%6i \"%s\" [%i-%i]\n'
                  % (num_works + row[0], row[3], row[1], row[2]))
        clu.write('%s\n' % 0)

    net.write('*Edges\n')
    # Walk works in vertex order so the edge list is deterministic.
    for row in work_rows:
        for number in links[row[3]]:
            net.write('%5i %5i 1 [%4i]\n'
                      % (row[0], num_works + number, row[2]))

    # Stable sort: authors sharing a last word keep their numeric order.
    for row in sorted(author_rows, key=_surname_key):
        aut.write('%6i \"%s\"\n' % (num_works + row[0], row[3]))

    eqv.write('*vertices %s \n' % num_all)
    for vertex in range(1, num_all + 1):
        eqv.write('%s\n' % vertex)


def run(workdir, input):
    """Convert the tagged DBLP dump workdir+input into Pajek files.

    workdir is prepended verbatim to every file name, so it must end with a
    path separator.  With <name> being *input* minus its last extension, the
    files written next to the input are:
      <name>.net     works and authors as vertices, work-author edges
      <name>.nam     work titles
      <name>.vec     year of each work
      <name>.clu     type code of each work, 0 for authors
      <name>.aut     authors ordered by the last word of their name
      <name>eqv.clu  identity partition 1..N

    Progress is printed to standard output.  If the input file cannot be
    opened the error is printed and sys.exit() is called (SystemExit).
    """
    source = workdir + input
    try:
        # open input DBLP file
        bib = open(source, 'r')
    except IOError as err:
        print("I/O error(%s): %s" % (err.errno, err.strerror))
        exit()
    opened = [bib]
    try:
        print('dblpFile =  %s' % source)
        dot = input.rfind('.')
        # without a dot keep the whole name (slicing with -1 would drop a char)
        name = input[:dot] if dot >= 0 else input
        base = workdir + name
        net = _create(base + '.net', opened, 'pajekFile = ')
        nam = _create(base + '.nam', opened, 'namesFile = ')
        nam.write('*vertices\n')
        vec = _create(base + '.vec', opened, 'yearFile = ')
        clu = _create(base + '.clu', opened, 'typeFile = ')
        aut = _create(base + '.aut', opened)
        eqv = _create(base + 'eqv.clu', opened)

        start = _timer()
        print("-------- Document Start --------")
        works, people, links, counts = _read_records(bib, nam)
        print("-------- Document End ----------")
        print("-------- Export Network --------")
        num_lines, num_records, num_works, zero_years = counts
        print('# of lines=%s, records=%s, works=%s, authors=%s, year0=%s\n'
              % (num_lines, num_records, num_works, len(people), zero_years))
        _export(net, vec, clu, aut, eqv, works, people, links, source)
        print('elapsed time = %s seconds' % (_timer() - start))
    finally:
        # close() flushes; runs even if parsing or exporting failed
        for handle in opened:
            handle.close()
    print("-------- Dblp2Pajek - done -----")


#
# run dblp2pajek
#
if __name__ == '__main__':
    # run it from command line
    if len(argv) == 3:
        run(argv[1], argv[2])
    else:
        print("Module Dblp2Pajek")
        print("Two arguments (WorkDir, DBLPfile) required to run !")
        exit()
    print('')
else:
    # it is imported
    print("Module Dblp2Pajek imported.")
    print("To run, type: Dblp2Pajek.run('D:\\vlado\\DBLP\\','DBLP.out')")
    print("where 'DBLP.out' is converted (using DblpSaxDH) DBLP file")
#- End -------------------------------------------------------------------------------