#!/usr/bin/python
###########################################################################
#
#   WoS2Pajek - Transforming WoS bibliographies into Pajek
#
#   WoS2Pajek.run(project,WoSbib)
#
# import sys; sys.path.append(r'D:\Vlado\work\Python\WoS')
#
# import WoS2Pajek; WoS2Pajek.run('batagelj','batagelj.WoS',500,False,10)
# reload(WoS2Pajek); WoS2Pajek.run('batagelj','batagelj.WoS',500,False,1)
# reload(WoS2Pajek); WoS2Pajek.run('SN','SN.WoS',200000,False,500)
# reload(WoS2Pajek); WoS2Pajek.run('SN','SN.WoS',200000,False,0)
# reload(WoS2Pajek); WoS2Pajek.run('VBAF','VBAFall.WoS',50000,False,50)
# reload(WoS2Pajek); WoS2Pajek.run('VBAF','VBAF.WoS',2000,False,1)
#
# An optional sixth argument, workdir, selects the directory that holds
# the bibliography and receives the output files; when it is omitted the
# historical location d:\vlado\work\python\WoS\<project>\ is used.
#
# Runs on Python 2.6+ and Python 3.
#
#   Vladimir Batagelj, 23-30. march 2007
#
###########################################################################

import os
import shutil
import string
import sys

# Root under which run() looks for <project>\ when no workdir is given.
DEFAULT_ROOT = 'd:\\vlado\\work\\python\\WoS\\'

# A WoS line whose tag column holds two blanks continues the previous field.
_CONTINUATION = ' ' * 2

# Names of the intermediate files created (and finally removed) by run().
_TEMP_FILES = ('works.tmp', 'authors.tmp', 'years.tmp', 'arcs.tmp',
               'nodes.tmp')

# Module state shared by run(), infVertex() and infAuthor().  run() resets
# all of it; the values below only make the names exist before the first run.
nver = 0          # number of works (vertices) registered so far
vert = {}         # work name -> vertex number
maxver = 0        # upper bound on vertex numbers (exclusive)
naut = 0          # number of authors registered so far
aut = {}          # author name -> author number
ISI = False       # True: keep full ISI names; False: use short names
step = 1000000    # progress is reported every `step` records
nodes = None      # open nodes.tmp while run() is reading the bibliography
years = None      # open years.tmp while run() is reading the bibliography
authors = None    # open authors.tmp while run() is reading the bibliography


def _parse_int(text):
    """Return `text` as an int, or None when it is not a plain integer.

    Replaces the eval() calls of the original code: bibliography fields are
    external data and must never be executed.
    """
    try:
        return int(text.strip())
    except (ValueError, AttributeError):
        return None


def _short_author(AU):
    """Return the short author label built from a 'Surname, Initials' string.

    The label is the upper-cased surname without blanks, cut to 8 characters,
    followed by '_' and the first initial (when there is one).
    """
    parts = AU.upper().split(", ")
    label = parts[0].replace(' ', '')[:8] + '_'
    if len(parts) > 1 and parts[1]:
        label = label + parts[1][0]
    return label


def ISIname(AU, PY, J9, VL, BP):
    """Build the ISI style name 'AUTHOR, year, journal, Vvol, Ppage'.

    AU is the first author as 'Surname, Initials'; PY, J9, VL and BP are the
    year, abbreviated journal (cut to 20 characters), volume and first page.
    Empty fields are left out.
    """
    name = AU.upper().replace(' ', '').replace(',', ' ')
    if PY != '':
        name = name + ', ' + PY
    if J9 != '':
        name = name + ', ' + J9[:20]
    if VL != '':
        name = name + ', V' + VL
    if BP != '':
        name = name + ', P' + BP
    return name


def nameG(name):
    """Convert a cited-reference string into the short name 'AUTH_I(year)vol:page'.

    `name` is a ', '-separated reference such as
    'SMITH J, 1990, J SOC NETWORKS, V12, P34'.  When the first field is itself
    a number it is taken as the year and the label becomes '$' plus the second
    field.  If no label can be formed the reference is reported on stdout and
    returned unchanged.
    """
    s = name.split(", ")
    head = s[0]
    y = 0
    v = ''
    p = ''
    if head.startswith('*'):
        # anonymous / corporate source: keep the whole field as the label
        n = head
    else:
        q = head.split(" ")
        n = q[0][:8]
        initials = [token for token in q[1:] if token]
        if initials:
            n = n + '_' + initials[0][0]
    if len(s) >= 2:
        year = _parse_int(s[1])
        if year is not None:
            y = year
    if len(s) >= 4:
        if s[3].startswith('V'):
            v = s[3][1:]
            if len(s) >= 5 and s[4].startswith('P'):
                p = s[4][1:]
        elif s[3].startswith('P'):
            p = s[3][1:]
    if y == 0 and len(s) > 1:
        # no year in the second field: maybe the reference starts with it
        year = _parse_int(head)
        if year is not None:
            y = year
            n = '$' + s[1].replace(' ', '_')
    if v == '' and p == '' and y == 0 and len(s) > 1:
        # nothing recognised: keep the second field as a pseudo page
        p = s[1].replace(' ', '_')
    if n != '':
        return n + '(' + str(y) + ')' + v + ':' + p
    # non-standard
    print("-->  " + name)
    return name


def Gname(AU, PY, VL, BP):
    """Build the short name 'AUTH_I(year)vol:page' of a described work."""
    return _short_author(AU) + '(' + str(PY) + ')' + VL + ':' + BP


def infVertex(name):
    """Return the vertex number of work `name`, registering it if it is new.

    A new work gets the next number, a line in nodes.tmp and its publication
    year (0 when none can be read from the name) in years.tmp.  Only valid
    while run() is reading the bibliography.  Raises SystemExit when the
    vertex limit given to run() is reached.
    """
    global nver
    if name in vert:
        return vert[name]
    year = None
    if ISI:
        parts = name.split(", ")
        if len(parts) > 1:
            year = _parse_int(parts[1])
    else:
        opening = name.find('(')
        closing = name.find(')')
        if 0 <= opening < closing:
            year = _parse_int(name[opening + 1:closing])
    if year is None:
        # the year file is a Pajek partition and must hold integers only
        year = 0
    nver = nver + 1
    if nver >= maxver:
        print("\n*** max vertex number exceeded\n*** program exited")
        sys.exit(0)
    vert[name] = nver
    nodes.write(str(nver) + ' "' + name + '"\n')
    years.write(str(year) + '\n')
    if '"' in name:
        print('***** Bad label  ' + str(nver) + '  :  ' + name)
    return nver


def infAuthor(name):
    """Return the number of author `name`, registering it if it is new.

    A new author gets the next number and a line in authors.tmp.  Only valid
    while run() is reading the bibliography.
    """
    global naut
    if name in aut:
        return aut[name]
    naut = naut + 1
    aut[name] = naut
    authors.write(str(naut) + ' "' + name + '"\n')
    return naut


def _cited_author(name):
    """Return the author label of a cited work given its vertex name."""
    if ISI:
        return name.partition(',')[0]
    if name.startswith('*') or name.startswith('$'):
        return 'UNKNOWN'
    return name.partition('(')[0]


def _read_records(bib, arcs, temp, vertype):
    """Scan the WoS file `bib` and fill the temporary files.

    Citation arcs go to `arcs`, work-author pairs to `temp`; `vertype[i]` is
    set to 1 for every described work i.  Reading stops at the first EF tag.
    """
    numbib = 0
    state = ''
    bibAU = ''
    bibVL = ''
    bibJ9 = ''
    bibBP = ''
    bibPY = '0'
    listCR = []
    listAU = []
    for line in bib:
        control = line[:2]
        if control != _CONTINUATION:
            state = control
        # drop the tag column and the line end (the last line may lack one)
        content = line[3:].rstrip('\r\n')
        if control == 'PT':
            numbib = numbib + 1
            bibAU = ''
            bibVL = ''
            bibJ9 = ''
            bibBP = ''
            bibPY = '0'
            listCR = []
            listAU = []
        elif control == 'AU':
            if bibAU == '':
                bibAU = content
        elif control == 'J9':
            bibJ9 = content
        elif control == 'PY':
            bibPY = content
        elif control == 'VL':
            bibVL = content
        elif control == 'BP':
            bibBP = content
        elif control == 'ER':
            # records without cited references are not added to the network
            if listCR:
                if ISI:
                    name = ISIname(bibAU, bibPY, bibJ9, bibVL, bibBP)
                else:
                    name = Gname(bibAU, bibPY, bibVL, bibBP)
                if (numbib % step) == 0:
                    print(str(numbib) + ' : ' + name)
                sind = infVertex(name)
                if vertype[sind]:
                    print("*** duplicate:  "
                          + ISIname(bibAU, bibPY, bibJ9, bibVL, bibBP))
                else:
                    vertype[sind] = 1
                    for tind in listCR:
                        arcs.write(str(sind) + ' ' + str(tind) + '\n')
                    for aind in listAU:
                        temp.write(str(sind) + ' ' + str(aind) + '\n')
        elif control == 'EF':
            break
        if state == 'CR' and content.strip():
            if ISI:
                name = content
            else:
                name = nameG(content)
            newWork = name not in vert
            work = infVertex(name)
            listCR.append(work)
            if newWork:
                iaut = infAuthor(_cited_author(name))
                temp.write(str(work) + ' ' + str(iaut) + '\n')
        if state == 'AU':
            if ISI:
                author = content.upper().replace(' ', '').replace(',', ' ')
            else:
                author = _short_author(content)
            listAU.append(infAuthor(author))


def _assemble(target, header, sources):
    """Write `header` followed by the contents of `sources` to file `target`."""
    with open(target, 'w') as out:
        out.write(header)
        for source in sources:
            with open(source, 'r') as inp:
                shutil.copyfileobj(inp, out)


def _write_outputs(workdir, project, vertype):
    """Build the Pajek files of `project` from the temporary files."""
    print("number of works =  " + str(nver))
    print("number of authors =  " + str(naut))
    vertices = '*vertices ' + str(nver) + '\n'

    # year of publication partition
    print("\n*** FILES:\nyear of publication partition: "
          + project + 'Year.clu')
    _assemble(workdir + project + 'Year.clu', vertices,
              [workdir + 'years.tmp'])

    # described / cited only - partition
    print("described / cited only - partition: " + project + 'DC.clu')
    with open(workdir + project + 'DC.clu', 'w') as part:
        part.write(vertices)
        # vertex numbers start at 1, slot 0 of vertype is unused
        for kind in vertype[1:nver + 1]:
            part.write(str(kind) + '\n')

    # citation network
    print("citation network: " + project + 'Cite.net')
    _assemble(workdir + project + 'Cite.net', vertices,
              [workdir + 'nodes.tmp', workdir + 'arcs.tmp'])

    # works X authors network
    print("works X authors network: " + project + 'WA.net')
    with open(workdir + project + 'WA.net', 'w') as wa:
        wa.write('*vertices ' + str(nver + naut) + ' ' + str(nver) + '\n')
        with open(workdir + 'nodes.tmp', 'r') as inp:
            shutil.copyfileobj(inp, wa)
        with open(workdir + 'authors.tmp', 'r') as inp:
            for line in inp:
                # split once only: the quoted label may contain blanks
                index, label = line.split(' ', 1)
                wa.write(str(int(index) + nver) + ' ' + label)
        wa.write('*arcs\n')
        with open(workdir + 'works.tmp', 'r') as inp:
            for line in inp:
                work, author = line.split()
                wa.write(work + ' ' + str(int(author) + nver) + '\n')


def run(project, bibfile, maxvert, ISIn, listep, workdir=None):
    """Transform the WoS bibliography `bibfile` into Pajek files.

    project -- prefix of the output files (<project>Year.clu, <project>DC.clu,
               <project>Cite.net and <project>WA.net)
    bibfile -- name of the WoS export file inside the work directory
    maxvert -- vertex numbers must stay below this limit
    ISIn    -- True to label works with full ISI names, False for short names
    listep  -- report every listep-th record (0 disables the reports)
    workdir -- directory holding bibfile and receiving all files; defaults to
               DEFAULT_ROOT + project

    Raises SystemExit when the bibliography cannot be opened or the vertex
    limit is reached.
    """
    global nver, vert, maxver, nodes, ISI, aut, naut, years, authors, step
    maxver = maxvert
    ISI = ISIn
    step = 1000000
    if listep > 0:
        step = listep
    print("\n*** WoS2Pajek - 0.1")
    print("by V. Batagelj, April 3, 2007 / March 23, 2007\n")
    if workdir is None:
        workdir = DEFAULT_ROOT + project + '\\'
    else:
        # make sure the directory ends with a separator
        workdir = os.path.join(workdir, '')
    vertype = [0] * maxver
    try:
        bib = open(workdir + bibfile, 'r')
    except (IOError, OSError):
        print('wrong WoS file')
        sys.exit()
    vert = {}
    nver = 0
    aut = {}
    naut = 0
    opened = [bib]
    try:
        nodes = open(workdir + 'nodes.tmp', 'w')
        opened.append(nodes)
        arcs = open(workdir + 'arcs.tmp', 'w')
        opened.append(arcs)
        arcs.write('*arcs \n')
        temp = open(workdir + 'works.tmp', 'w')
        opened.append(temp)
        authors = open(workdir + 'authors.tmp', 'w')
        opened.append(authors)
        years = open(workdir + 'years.tmp', 'w')
        opened.append(years)
        _read_records(bib, arcs, temp, vertype)
    finally:
        # also reached on SystemExit, so no handle is left open
        for handle in opened:
            handle.close()

    _write_outputs(workdir, project, vertype)

    # try every temp file, not only those before the first failure
    failed = False
    for fname in _TEMP_FILES:
        try:
            os.remove(workdir + fname)
        except OSError:
            failed = True
    if failed:
        print("unable to delete some temp files")
    print("***\n")