#!/usr/bin/python
###########################################################################
#
#   WoS2Pajek - Transforming WoS bibliographies into Pajek
#
#   Wos2Pajek.run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)
#
#   import sys; wdir = r'c:\users\Batagelj\work\Python\WoS'; sys.path.append(wdir)
#   MLdir = r'c:\Python25\Lib\site-packages\MontyLingua-2.1\Python'
#   sys.path.append(MLdir)
#
#   import WoS2Pajek;  WoS2Pajek.run(wdir,MLdir,'batagelj','batagelj.WoS',500,False,True,10)
#   reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'batagelj','batagelj.WoS',500,False,True,1)
#   reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'SN','SN.WoS',200000,False,False,500)
#   reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'SN','SN.WoS',200000,False,False,0)
#   reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'VBAF','VBAFall.WoS',50000,False,False,50)
#   reload(WoS2Pajek); WoS2Pajek.run(wdir,MLdir,'VBAF','VBAF.WoS',2000,False,False,1)
#
#   Vladimir Batagelj,  23-30. March 2007
#
#   1. 12-14. August 2007  add Journals and Keywords (ID+DE+TI)
#   2. 24. August 2007     add number of pages
#   3. 6. December 2007    add Abstracts (AB) and lemmatization, timestamps
#   4. 27. December 2007   remove DOI info; consider ARTN/AR and UNSP in ISIname
#   5. 17-24. June 2008    file without duplicates 
#   To do: cite -> self, other
###########################################################################

import string, os, shutil, sys, re, datetime

def lemmatize(ML,ab,stopwords):
    """Return the distinct lemmas of a text with the stop words removed.

    ML        -- object providing split_sentences, tokenize, tag_tokenized
                 and lemmatise_tagged (run() passes a MontyLingua instance)
    ab        -- text to analyse; it is lower-cased before processing
    stopwords -- collection of lemmas to discard

    Returns a list without duplicates; the order is that of a set and
    therefore arbitrary.
    """
    sLto = [ML.tokenize(st) for st in ML.split_sentences(ab.lower())]
    sLta = [ML.tag_tokenized(t) for t in sLto]
    lem = [ML.lemmatise_tagged(t) for t in sLta]
    # A set gives constant-time stop word tests; the list may be long.
    stop = set(stopwords)
    lemmas = []
    # The lemma is read from the third '/'-separated field of every token
    # (presumably word/tag/lemma -- TODO confirm against MontyLingua).
    # Splitting on any whitespace and skipping tokens with fewer than three
    # fields keeps empty texts and doubled blanks from raising IndexError.
    for token in ' '.join(lem).split():
        fields = token.split('/')
        if len(fields) > 2 and fields[2] not in stop:
            lemmas.append(fields[2])
    return list(set(lemmas))

def dropList(mylist, rmlist):
    """Return a new list with the items of mylist that are not in rmlist.

    The order of the kept items is preserved.  The result is always a
    list, whatever the type of mylist or the Python version in use.
    """
    return [item for item in mylist if item not in rmlist]
   
def ISIname(AU,PY,J9,VL,BP,AR):
    """Build the ISI name of a work from its description fields.

    AU -- author: upper-cased, blanks removed, commas turned into blanks
    PY -- year, added as is
    J9 -- journal abbreviation, cut to 20 characters
    VL -- volume, added with the prefix 'V'
    BP -- first page, added with the prefix 'P' when AR is empty
    AR -- article number, added with the prefix 'ARTN '; it wins over BP

    Empty fields are left out; the remaining parts are joined with ', '.
    """
    parts = [AU.upper().replace(' ','').replace(',',' ')]
    if PY != '':
        parts.append(PY)
    if J9 != '':
        parts.append(J9[:20])
    if VL != '':
        parts.append('V'+VL)
    if AR != '':
        parts.append('ARTN '+AR)
    elif BP != '':
        parts.append('P'+BP)
    return ', '.join(parts)

def nameG(name):
    """Transform an ISI name into a short name.

    The ISI name is split on ', '.  The short name has the form
        author(year)volume:page
    where
      author -- the first field unchanged when it starts with '*',
                otherwise its first word cut to 8 characters followed by
                '_' and the first letter of its second word
      year   -- the second field when it is an integer, else 0
      volume -- the fourth field without its leading 'V'
      page   -- a following 'P...', 'ARTN ...' or 'UNSP ...' field without
                that prefix

    When the second field is not a year but the first one is, the first
    field is used as the year and the author becomes '$' + second field.
    When no year, volume or page was found, the second field (blanks
    replaced by '_') is stored in the page position.

    A name that yields no author part is 'non-standard': it is reported
    on standard output and returned unchanged.
    """
    def locator(field):
        # page / article number carried by one field, '' if there is none
        if field.startswith('P'):
            return field[1:]
        if field.startswith(('ARTN', 'UNSP')):
            return field[5:]
        return ''

    s = name.split(", ")
    y = 0; v = ''; p = ''
    # startswith() is used instead of indexing so empty fields cannot
    # raise IndexError
    if s[0].startswith('*'):
        n = s[0]
    else:
        q = s[0].split(" ")
        n = q[0][:8]
        if len(q) > 1:
            if q[1] != '':
                n = n+'_'+q[1][0]
            else:
                # two consecutive blanks in the author field
                print("*** Error  "+name+"  :  "+n)
    if len(s) >= 2:
        # int() rather than eval(): the text comes from the bibliography
        # file and must never be executed
        try:
            y = int(s[1])
        except ValueError:
            pass
    if len(s) >= 4:
        if s[3].startswith('V'):
            v = s[3][1:]
            if len(s) >= 5:
                p = locator(s[4])
        else:
            p = locator(s[3])
    if (y == 0) and (len(s) > 1):
        # no year in the second field: maybe the name starts with the year
        try:
            y = int(s[0])
            n = '$'+s[1].replace(' ','_')
        except ValueError:
            pass
    if (v == '') and (p == '') and (y == 0) and (len(s) > 1):
        p = s[1].replace(' ','_')
    if n != '':
        return n+'('+str(y)+')'+v+':'+p
    # non-standard
    print("-->  "+name)
    return name

def Gname(AU,PY,VL,BP,AR):
    """Build the short name of a work from its description fields.

    AU -- author as 'Surname, Given names'
    PY -- year (any value accepted by str())
    VL -- volume
    BP -- first page
    AR -- article number; when not empty it is used in place of BP,
          as ISIname does

    Returns 'SURNAME_I(year)volume:page' where SURNAME is the upper-cased
    surname without blanks cut to 8 characters and I is the first letter
    of the given names (left out when there are none).
    """
    parts = AU.upper().split(", ")
    name = (parts[0].replace(' ',''))[:8]+'_'
    if len(parts) > 1:
        # slicing tolerates an empty given-name field
        name = name + parts[1][:1]
    if AR != '':
        locator = AR
    else:
        locator = BP
    return name+'('+str(PY)+')'+VL+':'+locator
  
def infVertex(name):
    """Return the Pajek number of the vertex (work) with the given name.

    A name seen before gets its existing number from the global dictionary
    vert.  A new name gets the next number (global nver), is stored in
    vert, and produces one line in each of the open global files nodes
    (number and quoted label) and years (publication year).

    The year is the second ', '-separated field of the name when the
    global ISI is true (0 if it is not an integer); otherwise it is the
    text between the first '(' and the first ')' (0 if there is none).

    When the new number reaches the global maxver a message is shown and
    the program exits.  A label containing the character '"' is reported
    on standard output.
    """
    global nver, vert, maxver, nodes, years, ISI
    if name in vert:
        return vert[name]
    if ISI:
        fields = name.split(", ")
        # int() rather than eval(): the text comes from the bibliography
        # file and must never be executed
        try:
            year = int(fields[1])
        except (IndexError, ValueError):
            year = 0
    else:
        start = name.find('(')+1
        stop = name.find(')')
        year = 0
        # without a '(...)' group the slice would be a piece of the label
        if 0 < start < stop:
            year = name[start:stop]
    nver = nver + 1
    if nver >= maxver:
        # best effort: show the message and wait for Enter; a failing
        # console must not hide the exit
        try:
            sys.stdout.write("\n*** max vertex number "+str(maxver)+
                " exceeded\n*** program exited\n\nPress Enter")
            sys.stdout.flush()
            sys.stdin.readline()
        except Exception:
            pass
        sys.exit(0)
    vert[name] = nver
    nodes.write(str(nver)+' "'+name+'"\n')
    years.write(str(year)+'\n')
    if name.find('"') >= 0:
        print('***** Bad label  '+str(nver)+'  :  '+name)
    return nver

def infAuthor(name):
    """Return the Pajek number of an author.

    A name seen before gets its existing number from the global dictionary
    aut.  A new name gets the next number (global naut), is stored in aut,
    and is written with its number to the open global file authors.
    """
    global naut, aut, authors
    if name in aut:
        return aut[name]
    naut = naut + 1
    aut[name] = naut
    authors.write(str(naut)+' "'+name+'"\n')
    return naut

def infJournal(name):
    """Return the Pajek number of a journal.

    A name seen before gets its existing number from the global dictionary
    jour.  A new name gets the next number (global njr), is stored in
    jour, and is written with its number to the open global file journals.
    """
    global njr, jour, journals
    if name in jour:
        return jour[name]
    njr = njr + 1
    jour[name] = njr
    journals.write(str(njr)+' "'+name+'"\n')
    return njr

def infKeyword(name):
    """Return the Pajek number of a keyword.

    A keyword seen before gets its existing number from the global
    dictionary keyw.  A new keyword gets the next number (global nkw), is
    stored in keyw, and is written with its number to the open global
    file keywords.
    """
    global nkw, keyw, keywords
    if name in keyw:
        return keyw[name]
    nkw = nkw + 1
    keyw[name] = nkw
    keywords.write(str(nkw)+' "'+name+'"\n')
    return nkw

def _stamp():
    # first line of every generated Pajek file: a comment with the creation time
    return "% created by WoS2Pajek "+datetime.datetime.now().ctime()+"\n"

def _appendFile(path, out):
    # copies the whole content of the file 'path' to the open file 'out'
    src = open(path, 'r')
    try:
        shutil.copyfileobj(src, out)
    finally:
        src.close()

def _writeFromParts(path, nver, parts):
    # writes a Pajek file: time stamp, '*vertices nver', then the listed files
    out = open(path, 'w')
    try:
        out.write(_stamp())
        out.write('*vertices '+str(nver)+'\n')
        for part in parts:
            _appendFile(part, out)
    finally:
        out.close()

def _writeValues(path, values):
    # writes a Pajek partition / vector file with one value per vertex
    out = open(path, 'w')
    try:
        out.write(_stamp())
        out.write('*vertices '+str(len(values))+'\n')
        for value in values:
            out.write(str(value)+'\n')
    finally:
        out.close()

def _writeTwoMode(workdir, netfile, labelfile, linkfile, nver, nsecond):
    # writes a two-mode network works X (journals | keywords | authors):
    # the second-mode vertices are renumbered to follow the nver works
    net = open(workdir+netfile, 'w')
    try:
        net.write(_stamp())
        net.write('*vertices '+str(nver+nsecond)+' '+str(nver)+'\n')
        _appendFile(workdir+'nodes.tmp', net)
        labels = open(workdir+labelfile, 'r')
        try:
            for line in labels:
                # split once only: the quoted label may contain blanks
                s = line.split(" ",1)
                net.write(str(int(s[0])+nver)+' '+s[1])
        finally:
            labels.close()
        net.write('*arcs\n')
        links = open(workdir+linkfile, 'r')
        try:
            for line in links:
                s = line.split(" ")
                net.write(s[0]+' '+str(int(s[1])+nver)+'\n')
        finally:
            links.close()
    finally:
        net.close()

def run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep):
    """Transform a WoS bibliography into Pajek files.

    wdir      -- base directory; all files live in its subdirectory 'project'
    MLdir     -- MontyLingua directory; appended to sys.path if not yet there
    project   -- subdirectory name, also the prefix of the produced files
    bibfile   -- name of the WoS file to read
    maxvert   -- upper bound on the number of works (vertices)
    ISIn      -- True: works are labelled by ISI names, False: by short names
    makeClean -- True: also write 'clean.WoS', the input without the
                 records recognised as duplicates
    listep    -- progress is reported for every listep-th record that has
                 cited references; values <= 0 select a step of 1000000

    The project directory must contain the file 'StopWords.dat'.  Produced
    files (prefixed by project): Year.clu, DC.clu, NP.vec, Cite.net,
    WJ.net, WK.net and WA.net.  Temporary '*.tmp' files are created in the
    project directory and removed at the end.
    """
    global nver, vert, maxver, nodes, ISI, aut, naut, years, authors
    global step, comlin, jour, njr, nkw, keyw, keywords, journals
    if MLdir and MLdir not in sys.path:
        sys.path.append(MLdir)
    import MontyLingua
    ML = MontyLingua.MontyLingua()
    maxver = maxvert; ISI = ISIn
    step = 1000000
    if listep > 0: step = listep
    t1 = datetime.datetime.now()
    if not comlin:
        print("\n*** WoS2Pajek - 0.5")
        print("by V. Batagelj, June 24, 2008 / March 23, 2007\n")
    print("started: "+t1.ctime()+"\n")
    # the trailing '' makes join() end the path with a separator
    workdir = os.path.join(wdir, project, '')
    vertype = [0]*maxver      # 1 = work described by a record, 0 = cited only
    numpages = [0]*maxver
    try:
        bib = open(workdir+bibfile, 'r')
    except IOError:
        print('wrong WoS file')
        sys.exit()
    opened = [bib]            # every file to close, even after an error
    try:
        swfile = open(workdir+'StopWords.dat', 'r')
        try:
            stopwords = swfile.read().lower().split()
        finally:
            swfile.close()
        stopwords = ['.',',',';','(',')','[',']','"','=','?','!',':','-','s','']+stopwords
        nodes = open(workdir+'nodes.tmp', 'w'); opened.append(nodes)
        arcs = open(workdir+'arcs.tmp', 'w'); opened.append(arcs)
        arcs.write('*arcs \n')
        temp = open(workdir+'works.tmp', 'w'); opened.append(temp)
        authors = open(workdir+'authors.tmp', 'w'); opened.append(authors)
        years = open(workdir+'years.tmp', 'w'); opened.append(years)
        journals = open(workdir+'journals.tmp', 'w'); opened.append(journals)
        jourlink = open(workdir+'jourlink.tmp', 'w'); opened.append(jourlink)
        keywords = open(workdir+'keywords.tmp', 'w'); opened.append(keywords)
        keylink = open(workdir+'keylink.tmp', 'w'); opened.append(keylink)
        if makeClean:
            clean = open(workdir+'clean.WoS', 'w'); opened.append(clean)
        vert = {}; nver = 0
        aut  = {}; naut = 0
        keyw = {}; nkw  = 0
        jour = {}; njr  = 1
        # journal 1 stands for 'unknown journal'; it is pre-registered, so
        # infJournal never writes its label -- write it here
        jour['*****'] = njr
        journals.write(str(njr)+' "*****"\n')
        numbib = 0; numdup = 0
        state = ''            # tag of the field the current line belongs to
        pending = []          # lines of the current record, for clean.WoS
        copyLine = True       # False when the current record is a duplicate
        # record fields; defined here so lines before the first PT are harmless
        bibAU = ''; bibPG = ''; bibAR = ''
        bibVL = ''; bibJ9 = '*****'; bibBP = ''; bibEP = ''
        bibTI = ''; bibID = ''; bibDE = ''; bibAB = ''; bibPY = '0'
        listCR = []; listAU = []
        # readline() loop: the body reads an extra line for DOI continuations,
        # which must not be mixed with file iteration
        while True:
            line = bib.readline()
            if not line: break
            if makeClean: pending.append(line)
            control = line[:2]
            if control != '  ': state = control    # '  ' marks a continuation
            content = line[3:].rstrip('\r\n')
            if control == 'PT':
                numbib = numbib + 1
                bibAU = ''; bibPG = ''; bibAR = ''
                bibVL = ''; bibJ9 = '*****'; bibBP = ''; bibEP = ''
                bibTI = ''; bibID = ''; bibDE = ''; bibAB = ''; bibPY = '0'
                listCR = []; listAU = []
            elif control == 'AU':
                if bibAU == '': bibAU = content
            elif control == 'J9': bibJ9 = content
            elif control == 'PY': bibPY = content
            elif control == 'VL': bibVL = content
            elif control == 'BP': bibBP = content
            elif control == 'EP': bibEP = content
            elif control == 'PG': bibPG = content
            elif control == 'AR':
                bibAR = content; bibBP = content
            elif control == '**': print("WoS query:  "+content)
            elif control == 'ER':
                if ISI: name = ISIname(bibAU,bibPY,bibJ9,bibVL,bibBP,bibAR)
                else: name = Gname(bibAU,bibPY,bibVL,bibBP,bibAR)
                sind = infVertex(name)
                ijour = infJournal(bibJ9)
                jourlink.write(str(sind)+' '+str(ijour)+'\n')
                # int() rather than eval(): field text comes from the input
                # file; non-numeric page data leaves the count at 0
                try:
                    if bibPG != '': numpages[sind] = int(bibPG)
                    else: numpages[sind] = 1 + int(bibEP) - int(bibBP)
                except ValueError:
                    pass
                if listCR and (numbib % step) == 0:
                    print(str(numbib)+' : '+name)
                if vertype[sind]:
                    print("*** duplicate:  "+ISIname(bibAU,bibPY,bibJ9,bibVL,bibBP,bibAR))
                    numdup = numdup + 1
                    copyLine = False
                else:
                    vertype[sind] = 1
                    for tind in listCR:
                        arcs.write(str(sind)+' '+str(tind)+'\n')
                    # authors are recorded for every described work, with or
                    # without cited references
                    for aind in listAU:
                        temp.write(str(sind)+' '+str(aind)+'\n')
                # cut the abstract at its last '(c)'
                ic = bibAB.lower().rfind("(c)")
                if ic > 0: bibAB = bibAB[:ic]
                text = (bibTI+'. '+bibID+';'+bibDE+'. '+bibAB).lower().replace("'"," ")
                for w in lemmatize(ML,text,stopwords):
                    ikey = infKeyword(w)
                    keylink.write(str(sind)+' '+str(ikey)+'\n')
                if makeClean:
                    if copyLine and pending: clean.write(''.join(pending))
                    pending = []
                copyLine = True
            if state == 'CR':
                # remove DOI info: a reference ending in 'DOI' continues on the
                # next line, which is skipped but kept for clean.WoS
                if content.endswith('DOI'):
                    skipped = bib.readline()
                    if makeClean: pending.append(skipped)
                cut = content.rfind(',')
                if cut >= 0 and content[cut+2:].startswith('DOI'):
                    content = content[:cut]
                if ISI: name = content
                else: name = nameG(content)
                newWork = name not in vert
                work = infVertex(name)
                listCR.append(work)
                if newWork:
                    # a cited work is only known by its first author
                    if ISI: author = name.split(',',1)[0]
                    elif name[:1] in ('*','$'): author = 'UNKNOWN'
                    else: author = name.split('(',1)[0]
                    iaut = infAuthor(author)
                    temp.write(str(work)+' '+str(iaut)+'\n')
                    ll = content.split(', ')
                    journal = '*****'
                    if len(ll) == 5: journal = ll[2]
                    ijour = infJournal(journal)
                    jourlink.write(str(work)+' '+str(ijour)+'\n')
            if state == 'AU':
                if ISI:
                    author = content.upper().replace(' ','').replace(',',' ')
                else:
                    s = content.upper().split(", ")
                    author = (s[0].replace(' ',''))[:8]+'_'
                    if len(s) > 1: author = author + s[1][:1]
                listAU.append(infAuthor(author))
            if state == 'TI': bibTI = bibTI + ' ' + content
            if state == 'ID': bibID = bibID + ' ' + content
            if state == 'DE': bibDE = bibDE + ' ' + content
            if state == 'AB': bibAB = bibAB + ' ' + content
        # lines after the last record (end-of-file marker)
        if makeClean and copyLine and pending:
            clean.write(''.join(pending))
    finally:
        for f in opened:
            f.close()
    print("number of works      =  "+str(nver))
    print("number of authors    =  "+str(naut))
    print("number of journals   =  "+str(njr))
    print("number of keywords   =  "+str(nkw))
    print("number of records    =  "+str(numbib))
    print("number of duplicates =  "+str(numdup))
    if makeClean: print("clean WoS data: "+'clean.WoS')

    # year of publication partition
    print("\n*** FILES:\nyear of publication partition: "+project+'Year.clu')
    _writeFromParts(workdir+project+'Year.clu', nver, [workdir+'years.tmp'])

    # described / cited only partition
    print("described / cited only partition: "+project+'DC.clu')
    _writeValues(workdir+project+'DC.clu', vertype[1:nver+1])

    # number of pages vector
    print("number of pages vector: "+project+'NP.vec')
    _writeValues(workdir+project+'NP.vec', numpages[1:nver+1])

    # citation network
    print("citation network: "+project+'Cite.net')
    _writeFromParts(workdir+project+'Cite.net', nver,
        [workdir+'nodes.tmp', workdir+'arcs.tmp'])

    # works X journals network
    print("works X journals network: "+project+'WJ.net')
    _writeTwoMode(workdir, project+'WJ.net', 'journals.tmp', 'jourlink.tmp', nver, njr)

    # works X keywords network
    print("works X keywords network: "+project+'WK.net')
    _writeTwoMode(workdir, project+'WK.net', 'keywords.tmp', 'keylink.tmp', nver, nkw)

    # works X authors network
    print("works X authors  network: "+project+'WA.net')
    _writeTwoMode(workdir, project+'WA.net', 'authors.tmp', 'works.tmp', nver, naut)

    # every temporary file is tried, one failure does not stop the others
    failed = False
    for tmp in ('works.tmp', 'authors.tmp', 'years.tmp', 'arcs.tmp', 'nodes.tmp',
                'keywords.tmp', 'keylink.tmp', 'journals.tmp', 'jourlink.tmp'):
        try:
            os.remove(workdir+tmp)
        except OSError:
            failed = True
    if failed:
        print("unable to delete some temp files")
    t2 = datetime.datetime.now()
    print("finished: "+t2.ctime())
    print("time used:  "+str(t2-t1))
    print("***")

#
# Run Wos2Pajek
#
# comlin tells run() whether the banner was already printed by the
# command-line entry point below
if __name__ == '__main__':
    comlin = True
    print("\n*** WoS2Pajek - 0.5")
    print("by V. Batagelj, June 24, 2008 / March 23, 2007\n")
    if len(sys.argv) == 9:
        for x in sys.argv[1:]: print(x)
        print("------------------------")
        wdir = sys.argv[1]
        MLdir = sys.argv[2]
        project = sys.argv[3]
        bibfile = sys.argv[4]
        # NOTE(review): eval() executes whatever the operator passes on the
        # command line; kept so that existing invocations (numbers and
        # True/False) keep working -- do not feed it untrusted input
        maxvert = eval(sys.argv[5])
        ISIn = eval(sys.argv[6])
        makeClean = eval(sys.argv[7])
        listep = eval(sys.argv[8])
    else:
        print("8 arguments required to run !")
        # NOTE(review): under Python 2 input() evaluates the typed text, so
        # strings have to be entered quoted and an empty answer raises
        # SyntaxError; kept unchanged for compatibility
        wdir    = input("WoS directory         = ")
        MLdir   = input("MontyLingua directory = ")
        project = input("project subdirectory  = ")
        bibfile = input("WoS file              = ")
        maxvert = input("max num of vertices   = ")
        ISIn    = input("ISInumber (True/False = ")
        makeClean = input("makeClean (True/False = ")
        listep  = input("list step             = ")
    if MLdir == '':
        MLdir = r'c:\Python25\Lib\site-packages\MontyLingua-2.1\Python'
    sys.path.append(MLdir)
    run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)
    print("")
    # wait for Enter before the console closes; the answer is read as plain
    # text, so an empty line no longer ends the run with a SyntaxError
    sys.stdout.write("Close console?")
    sys.stdout.flush()
    sys.stdin.readline()
else:
    comlin = False
    print("Module Wos2Pajek imported.\n")
    print("To run, type:")
    print("  Wos2Pajek.run(wdir,MLdir,project,bibfile,maxvert,ISIn,makeClean,listep)")
    print("for example:")
    print("  Wos2Pajek.run(r'D:\\Vlado\\work\\Python\\WoS',r'c:\\Python25\\Lib\\site-packages\\MontyLingua-2.1\\Python','SN','SN.WoS',200000,False,True,500)\n")

#- End -------------------------------------------------------------------------------