#!/usr/bin/python
###########################################################################
#
#   WoS2Pajek - Transforming WoS bibliographies into Pajek
#
#   Wos2Pajek.run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)
#
# import sys; wdir = r'c:\users\Batagelj\work\Python\WoS'; sys.path.append(wdir)
# MLdir = r'c:\Python25\Lib\site-packages\MontyLingua-2.1\Python'
# sys.path.append(MLdir)
#
# import WoS2Pajek; WoS2Pajek.run(wdir,MLdir,'batagelj','batagelj.WoS',500,False,True,10)
# reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'batagelj','batagelj.WoS',500,False,True,1)
# reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'SN','SN.WoS',200000,False,False,500)
# reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'SN','SN.WoS',200000,False,False,0)
# reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'VBAF','VBAFall.WoS',50000,False,False,50)
# reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'VBAF','VBAF.WoS',2000,False,False,1)
#
#   Vladimir Batagelj, 23-30. March 2007
#
#   1. 12-14. August 2007  add Journals and Keywords (ID+DE+TI)
#   2. 24. August 2007  add number of pages
#   3. 6. December 2007  add Abstracts (AB) and lemmatization, timestamps
#   4. 27. December 2007  remove DOI info; consider ARTN/AR and UNSP in ISIname
#   5. 17-24. June 2008  file without duplicates
#
#   To do: cite -> self, other
#
#   The code is kept inside the subset of Python that is valid both for
#   Python 2.5 (required by MontyLingua 2.1) and for Python 3.
###########################################################################

import string
import os
import shutil
import sys
import re
import datetime

try:
    _read_line = raw_input        # Python 2: plain line input
except NameError:
    _read_line = input            # Python 3: input() already returns text

# temporary files created by run() in the project directory
_TEMP_FILES = ('works.tmp', 'authors.tmp', 'years.tmp', 'arcs.tmp',
               'nodes.tmp', 'keywords.tmp', 'keylink.tmp',
               'journals.tmp', 'jourlink.tmp')


def _say(*parts):
    """Print the parts separated by single blanks (like the print statement)."""
    print(' '.join([str(p) for p in parts]))


def _to_int(text, default=0):
    """Return int(text), or default when text is not a plain integer."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def _before(text, sep):
    """Return the part of text in front of the first sep (all of it if absent)."""
    pos = text.find(sep)
    if pos < 0:
        return text
    return text[:pos]


def _ask(prompt):
    """Prompt for a value and evaluate the answer as a Python expression.

    NOTE(security): this evaluates whatever the operator types, exactly as
    the Python 2 built-in input() did (strings must be typed with quotes).
    It must only ever be fed by the person running the script.
    """
    return eval(_read_line(prompt))


def _stamp():
    """Return the comment line that starts every generated Pajek file."""
    return "% created by WoS2Pajek "+datetime.datetime.now().ctime()+"\n"


def _append_file(srcpath, out):
    """Copy the whole content of the file srcpath to the open file out."""
    src = open(srcpath, 'r')
    try:
        shutil.copyfileobj(src, out)
    finally:
        src.close()


def _write_concat(path, nver, parts):
    """Write stamp, '*vertices nver' and then the content of the files in parts."""
    out = open(path, 'w')
    try:
        out.write(_stamp())
        out.write('*vertices '+str(nver)+'\n')
        for part in parts:
            _append_file(part, out)
    finally:
        out.close()


def _write_values(path, nver, values):
    """Write stamp, '*vertices nver' and values[1..nver], one per line."""
    out = open(path, 'w')
    try:
        out.write(_stamp())
        out.write('*vertices '+str(nver)+'\n')
        for i in range(nver):
            out.write(str(values[i+1])+'\n')
    finally:
        out.close()


def _write_two_mode(workdir, outname, nrows, ncols, labelsname, linksname):
    """Write a two-mode network works X (authors | journals | keywords).

    The first nrows vertices are the works (copied from nodes.tmp); the
    numbers found in the label and link temp files are shifted by nrows.
    """
    net = open(workdir+outname, 'w')
    try:
        net.write(_stamp())
        net.write('*vertices '+str(nrows+ncols)+' '+str(nrows)+'\n')
        _append_file(workdir+'nodes.tmp', net)
        labels = open(workdir+labelsname, 'r')
        try:
            for line in labels:
                # split once only: the quoted label may itself contain blanks
                num, label = line.split(" ", 1)
                net.write(str(int(num)+nrows)+' '+label)
        finally:
            labels.close()
        net.write('*arcs\n')
        links = open(workdir+linksname, 'r')
        try:
            for line in links:
                s = line.split(" ")
                net.write(s[0]+' '+str(int(s[1])+nrows)+'\n')
        finally:
            links.close()
    finally:
        net.close()


def lemmatize(ML,ab,stopwords):
    """Return the distinct lemmas of the text ab that are not stop words.

    ML must offer split_sentences, tokenize, tag_tokenized and
    lemmatise_tagged; the third '/'-separated part of every blank-separated
    item returned by lemmatise_tagged is taken as the lemma.  Items with
    fewer than three parts are skipped.  The order of the result is
    arbitrary (it comes from a set).
    """
    lemmas = []
    for sentence in ML.split_sentences(ab.lower()):
        tagged = ML.tag_tokenized(ML.tokenize(sentence))
        for token in ML.lemmatise_tagged(tagged).split(' '):
            parts = token.split('/')
            if len(parts) > 2:
                lemmas.append(parts[2])
    return list(set(dropList(lemmas,stopwords)))


def dropList(mylist, rmlist):
    """Return a list of the items of mylist that do not occur in rmlist."""
    return [item for item in mylist if item not in rmlist]


def ISIname(AU,PY,J9,VL,BP,AR):
    """From a work description produce its ISI name.

    Empty fields are left out; the article number AR takes precedence
    over the beginning page BP.
    """
    name = AU.upper().replace(' ','').replace(',',' ')
    if PY != '': name = name+', '+PY
    if J9 != '': name = name+', '+J9[:20]
    if VL != '': name = name+', V'+VL
    if AR != '': name = name+', ARTN '+AR
    elif BP != '': name = name+', P'+BP
    return name


def nameG(name):
    """Transform an ISI name into a short name  AUTHOR_I(year)volume:page.

    If the name is 'non-standard' it is taken as the short name itself and
    a warning is printed.
    """
    s = name.split(", ")
    y = 0; v = ''; p = ''; n = ''
    if s[0].startswith('*'):
        n = s[0]
    else:
        q = s[0].split(" ")
        n = q[0][:8]
        if len(q) > 1:
            if q[1]: n = n+'_'+q[1][0]
            else: _say("*** Error ", name, " : ", n)
    if len(s) >= 2:
        # int() instead of eval(): the text comes from the bibliography file
        y = _to_int(s[1])
    if len(s) >= 4:
        if s[3].startswith('V'):
            v = s[3][1:]
            if len(s) >= 5:
                if s[4].startswith('P'): p = s[4][1:]
                elif s[4].startswith('ARTN'): p = s[4][5:]
                elif s[4].startswith('UNSP'): p = s[4][5:]
        elif s[3].startswith('P'): p = s[3][1:]
        elif s[3].startswith('ARTN'): p = s[3][5:]
        elif s[3].startswith('UNSP'): p = s[3][5:]
    if (y == 0) and (len(s) > 1):
        # no author: the first field is the year, the second one the source
        try:
            y = int(s[0])
            n = '$'+s[1].replace(' ','_')
        except ValueError:
            pass
    if (v == '') and (p == '') and (y == 0) and (len(s) > 1):
        p = s[1].replace(' ','_')
    if n != '':
        return n+'('+str(y)+')'+v+':'+p
    # non-standard
    _say("--> ", name)
    return name


def Gname(AU,PY,VL,BP,AR):
    """From a work description produce its short name.

    The article number AR is used in place of the beginning page BP when
    it is given - the same precedence as in ISIname and nameG.
    """
    s = AU.upper().split(", ")
    name = (s[0].replace(' ',''))[:8]+'_'
    if len(s) > 1: name = name + s[1][:1]
    if AR != '':
        return name+'('+str(PY)+')'+VL+':'+AR
    return name+'('+str(PY)+')'+VL+':'+BP


def infVertex(name):
    """Determine the Pajek number of the vertex (work) with a given name.

    A new name gets the next number and is appended to the nodes and years
    temp files.  If the name contains the character " a warning is
    reported.  The program exits when maxver is reached.
    """
    global nver, vert, maxver, nodes, years, ISI, step
    if name in vert:
        return vert[name]
    if ISI:
        s = name.split(", ")
        year = 0
        if len(s) > 1: year = _to_int(s[1])
    else:
        z = name.find('(')+1; k = name.find(')')
        # non-standard names have no '(year)' part: record year 0 for them
        year = _to_int(name[z:k])
    nver = nver + 1
    if nver >= maxver:
        try:
            _read_line("\n*** max vertex number "+str(maxver)+
                " exceeded\n*** program exited\n\nPress Enter")
        except (EOFError, KeyboardInterrupt):
            pass
        sys.exit(0)
    vert[name] = nver
    nodes.write(str(nver)+' "'+name+'"\n')
    years.write(str(year)+'\n')
    if name.find('"') >= 0:
        _say('***** Bad label ', nver, ' : ', name)
    return nver


def infAuthor(name):
    """Determine the Pajek number of an author (new ones go to authors)."""
    global naut, aut, authors
    if name in aut:
        return aut[name]
    naut = naut + 1; aut[name] = naut
    authors.write(str(naut)+' "'+name+'"\n')
    return naut


def infJournal(name):
    """Determine the Pajek number of a journal (new ones go to journals)."""
    global njr, jour, journals
    if name in jour:
        return jour[name]
    njr = njr + 1; jour[name] = njr
    journals.write(str(njr)+' "'+name+'"\n')
    return njr


def infKeyword(name):
    """Determine the Pajek number of a keyword (new ones go to keywords)."""
    global nkw, keyw, keywords
    if name in keyw:
        return keyw[name]
    nkw = nkw + 1; keyw[name] = nkw
    keywords.write(str(nkw)+' "'+name+'"\n')
    return nkw


def run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep):
    """Transform the WoS file wdir/project/bibfile into Pajek files.

    wdir      - directory that contains the project subdirectory
    MLdir     - MontyLingua directory; not used here, the caller has to
                put it on sys.path before calling run
    project   - project subdirectory, also the prefix of the output files
    bibfile   - name of the WoS file; StopWords.dat must be next to it
    maxvert   - upper bound on the number of works (the program exits
                when it is reached)
    ISIn      - True: works are named by ISI names, False: by short names
    makeClean - True: also write clean.WoS, a copy without duplicates
    listep    - every listep-th record with references is listed;
                values <= 0 select the step 1000000

    Written files: <project>Year.clu, <project>DC.clu, <project>NP.vec,
    <project>Cite.net, <project>WJ.net, <project>WK.net, <project>WA.net.
    """
    import MontyLingua
    global nver, vert, maxver, nodes, ISI, aut, naut, years, authors
    global step, comlin, jour, njr, nkw, keyw, keywords, journals
    ML = MontyLingua.MontyLingua()
    maxver = maxvert; ISI = ISIn
    step = 1000000
    if listep > 0: step = listep
    t1 = datetime.datetime.now()
    if not comlin:
        _say("\n*** WoS2Pajek - 0.5")
        _say("by V. Batagelj, June 24, 2008 / March 23, 2007\n")
    _say("started: "+t1.ctime()+"\n")
    # the trailing '' makes join() end the path with a separator
    workdir = os.path.join(wdir, project, '')
    vertype = [0] * maxver
    numpages = [0] * maxver
    try:
        bib = open(workdir+bibfile, 'r')
    except IOError:
        _say('wrong WoS file')
        sys.exit()
    swfile = open(workdir+'StopWords.dat', 'r')
    try:
        stopwords = swfile.read().lower().split()
    finally:
        swfile.close()
    # a set: every lemma of every record is looked up in it
    stopwords = set(['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']
        + stopwords)
    nodes = open(workdir+'nodes.tmp', 'w')
    arcs = open(workdir+'arcs.tmp', 'w'); arcs.write('*arcs \n')
    temp = open(workdir+'works.tmp', 'w')
    authors = open(workdir+'authors.tmp', 'w')
    years = open(workdir+'years.tmp', 'w')
    journals = open(workdir+'journals.tmp', 'w')
    jourlink = open(workdir+'jourlink.tmp', 'w')
    keywords = open(workdir+'keywords.tmp', 'w')
    keylink = open(workdir+'keylink.tmp', 'w')
    if makeClean: clean = open(workdir+'clean.WoS', 'w')
    copyLine = True
    record = []            # lines of the current record (only for makeClean)
    vert = {}; nver = 0
    aut = {}; naut = 0
    keyw = {}; nkw = 0
    jour = {}; njr = 0
    # journal 1 is the placeholder for 'journal unknown'; registering it
    # here also writes its label, so that the vertex is labelled in WJ.net
    infJournal('*****')
    numbib = 0; numdup = 0
    state = ''
    # defaults, so that lines in front of the first PT cannot hit unset names
    bibAU = ''; bibPG = ''; bibAR = ''
    bibVL = ''; bibJ9 = '*****'; bibBP = ''; bibEP = ''
    bibTI = ''; bibID = ''; bibDE = ''; bibAB = ''; bibPY = '0'
    listCR = []; listAU = []
    while True:
        line = bib.readline()
        if not line: break
        if makeClean: record.append(line)
        control = line[:2]
        # continuation lines start with blanks and keep the current state
        if control != '  ': state = control
        content = line.rstrip('\r\n')[3:]
        if control == 'PT':
            numbib = numbib + 1
            bibAU = ''; bibPG = ''; bibAR = ''
            bibVL = ''; bibJ9 = '*****'; bibBP = ''; bibEP = ''
            bibTI = ''; bibID = ''; bibDE = ''; bibAB = ''; bibPY = '0'
            listCR = []; listAU = []
        elif control == 'AU':
            if bibAU == '': bibAU = content
        elif control == 'J9': bibJ9 = content
        elif control == 'PY': bibPY = content
        elif control == 'VL': bibVL = content
        elif control == 'BP': bibBP = content
        elif control == 'EP': bibEP = content
        elif control == 'PG': bibPG = content
        elif control == 'AR': bibAR = content; bibBP = content
        elif control == '**': _say("WoS query: ", content)
        elif control == 'ER':
            if ISI: name = ISIname(bibAU,bibPY,bibJ9,bibVL,bibBP,bibAR)
            else: name = Gname(bibAU,bibPY,bibVL,bibBP,bibAR)
            sind = infVertex(name)
            ijour = infJournal(bibJ9)
            jourlink.write(str(sind)+' '+str(ijour)+'\n')
            try:
                # int() instead of eval(): the text comes from the WoS file
                if bibPG != '': numpages[sind] = int(bibPG)
                else: numpages[sind] = 1 + int(bibEP) - int(bibBP)
            except ValueError:
                pass           # page info missing or not numeric: keep 0
            if listCR and (numbib % step) == 0:
                _say(numbib, ':', name)
            if vertype[sind]:
                _say("*** duplicate: ",
                    ISIname(bibAU,bibPY,bibJ9,bibVL,bibBP,bibAR))
                numdup = numdup + 1
                if makeClean: copyLine = False
            else:
                vertype[sind] = 1
                # NOTE(review): author links of a described work are only
                # written when the record has cited references - confirm
                # that this is intended
                if listCR:
                    for tind in listCR:
                        arcs.write(str(sind)+' '+str(tind)+'\n')
                    for aind in listAU:
                        temp.write(str(sind)+' '+str(aind)+'\n')
            # drop the copyright notice from the end of the abstract
            ic = bibAB.lower().rfind("(c)")
            if ic > 0: bibAB = bibAB[:ic]
            text = bibTI+'. '+bibID+';'+bibDE+'. '+bibAB
            words = lemmatize(ML,text.lower().replace("'"," "),stopwords)
            for w in words:
                ikey = infKeyword(w)
                keylink.write(str(sind)+' '+str(ikey)+'\n')
            if makeClean:
                if copyLine and record: clean.write(''.join(record))
                record = []; copyLine = True
        if state == 'CR':
            # remove DOI info
            if content.endswith('DOI'):
                # the DOI value is on the following line: it is not a
                # reference, but it still belongs to the clean copy
                extra = bib.readline()
                if makeClean and extra: record.append(extra)
            cut = content.rfind(',')
            if content[cut+2:].startswith('DOI'): content = content[:cut]
            if ISI: name = content
            else: name = nameG(content)
            newWork = name not in vert
            work = infVertex(name)
            listCR.append(work)
            if newWork:
                if ISI: author = _before(name, ',')
                elif name[:1] in ('*', '$'): author = 'UNKNOWN'
                else: author = _before(name, '(')
                iaut = infAuthor(author)
                temp.write(str(work)+' '+str(iaut)+'\n')
                ll = content.split(', ')
                journal = '*****'
                if len(ll) == 5: journal = ll[2]
                ijour = infJournal(journal)
                jourlink.write(str(work)+' '+str(ijour)+'\n')
        if state == 'AU':
            if ISI:
                author = content.upper().replace(' ','').replace(',',' ')
            else:
                s = content.upper().split(", ")
                author = (s[0].replace(' ',''))[:8]+'_'
                if len(s) > 1: author = author + s[1][:1]
            listAU.append(infAuthor(author))
        if state == 'TI': bibTI = bibTI + ' ' + content
        if state == 'ID': bibID = bibID + ' ' + content
        if state == 'DE': bibDE = bibDE + ' ' + content
        if state == 'AB': bibAB = bibAB + ' ' + content
    if makeClean:
        if copyLine and record: clean.write(''.join(record))
        clean.close()
    bib.close(); nodes.close(); arcs.close(); journals.close()
    jourlink.close(); authors.close(); years.close(); temp.close()
    keylink.close(); keywords.close()
    _say("number of works = ", nver)
    _say("number of authors = ", naut)
    _say("number of journals = ", njr)
    _say("number of keywords = ", nkw)
    _say("number of records = ", numbib)
    _say("number of duplicates = ", numdup)
    if makeClean: _say("clean WoS data: "+'clean.WoS')
    # year of publication partition
    _say("\n*** FILES:\nyear of publication partition: "+project+'Year.clu')
    _write_concat(workdir+project+'Year.clu', nver, [workdir+'years.tmp'])
    # described / cited only partition
    _say("described / cited only partition: "+project+'DC.clu')
    _write_values(workdir+project+'DC.clu', nver, vertype)
    # number of pages vector
    _say("number of pages vector: "+project+'NP.vec')
    _write_values(workdir+project+'NP.vec', nver, numpages)
    # citation network
    _say("citation network: "+project+'Cite.net')
    _write_concat(workdir+project+'Cite.net', nver,
        [workdir+'nodes.tmp', workdir+'arcs.tmp'])
    # works X journals network
    _say("works X journals network: "+project+'WJ.net')
    _write_two_mode(workdir, project+'WJ.net', nver, njr,
        'journals.tmp', 'jourlink.tmp')
    # works X keywords network
    _say("works X keywords network: "+project+'WK.net')
    _write_two_mode(workdir, project+'WK.net', nver, nkw,
        'keywords.tmp', 'keylink.tmp')
    # works X authors network
    _say("works X authors network: "+project+'WA.net')
    _write_two_mode(workdir, project+'WA.net', nver, naut,
        'authors.tmp', 'works.tmp')
    # try every temp file, one failure must not keep the others alive
    leftover = False
    for tmpname in _TEMP_FILES:
        try:
            os.remove(workdir+tmpname)
        except OSError:
            leftover = True
    if leftover: _say("unable to delete some temp files")
    t2 = datetime.datetime.now()
    _say("finished: "+t2.ctime())
    _say("time used: ", t2-t1)
    _say("***")


#
# Run Wos2Pajek
#
if __name__ == '__main__':
    comlin = True
    _say("\n*** WoS2Pajek - 0.5")
    _say("by V. Batagelj, June 24, 2008 / March 23, 2007\n")
    if len(sys.argv) == 9:
        for x in sys.argv[1:]: _say(x)
        _say("------------------------")
        wdir = sys.argv[1]
        MLdir = sys.argv[2]
        project = sys.argv[3]
        bibfile = sys.argv[4]
        # NOTE(security): eval() of command line text - operator input only
        maxvert = eval(sys.argv[5])
        ISIn = eval(sys.argv[6])
        makeClean = eval(sys.argv[7])
        listep = eval(sys.argv[8])
    else:
        _say("8 arguments required to run !")
        wdir = _ask("WoS directory = ")
        MLdir = _ask("MontyLingua directory = ")
        project = _ask("project subdirectory = ")
        bibfile = _ask("WoS file = ")
        maxvert = _ask("max num of vertices = ")
        ISIn = _ask("ISInumber (True/False = ")
        makeClean = _ask("makeClean (True/False = ")
        listep = _ask("list step = ")
    if MLdir == '':
        MLdir = r'c:\Python25\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)
    _say("")
    try:
        # only a pause: the answer is read as text and ignored
        _read_line("Close console?")
    except EOFError:
        pass
else:
    comlin = False
    _say("Module Wos2Pajek imported.\n")
    _say("To run, type:")
    _say(" Wos2Pajek.run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)")
    _say("for example:")
    _say(" Wos2Pajek.run(r'D:\\Vlado\\work\\Python\\WoS',r'c:\\Python25\\Lib\\site-packages\\MontyLingua-2.1\\Python','SN','SN.WoS',200000,False,True,500)\n")
#- End -------------------------------------------------------------------------------