#!/usr/bin/python
###############################################################################
#
# Get content of directory
#
# SnNodes.run()
#
# import sys; sys.path.append(r'D:\Vlado\work\Python\Garfield')
#
# import SnNodes; SnNodes.run(0)
#
# reload(SnNodes); SnNodes.run(0)
#
# Vladimir Batagelj, 17. march 2007
#
# http://garfield.library.upenn.edu/histcomp/social-network_to/node/28.html
#
###############################################################################

import contextlib
import os
import string
import urllib

try:
    import urlparse                      # Python 2
except ImportError:
    import urllib.parse as urlparse      # Python 3

try:
    from urllib import urlopen           # Python 2
except ImportError:
    from urllib.request import urlopen   # Python 3

# Defaults taken from the original script.
BASE_URL = 'http://garfield.library.upenn.edu/histcomp/social-network_to/'
WORK_DIR = 'd:\\vlado\\work\\python\\garfield\\SN\\'
PAGE_LIMIT = 5876       # exclusive upper bound of the page numbers fetched

_MAX_TRIES = 10         # download attempts per page before giving up

# NOTE(review): fixed offset inherited from the original script; presumably it
# skips a constant prefix in front of the title text -- confirm on a live page.
_TITLE_START = 18

# NOTE(review): the original searched for an empty string here, which always
# matches at offset 0 and therefore never saved a page.  The closing title tag
# is assumed to be the intended marker -- confirm.
_TITLE_END = b'</title>'


def _as_text(raw):
    """Return the byte string *raw* as a native ``str`` suitable for print."""
    if isinstance(raw, str):             # Python 2: bytes is already str
        return raw
    return raw.decode('utf-8', 'replace')


def _fetch(url):
    """Return the raw bytes served at *url*; the connection is always closed."""
    with contextlib.closing(urlopen(url)) as page:
        return page.read()


def _split_title(content):
    """Return ``(end_offset, title)`` for the raw page *content*.

    ``end_offset`` is the position of the closing title tag (matched
    case-insensitively).  When the tag is absent the result is
    ``(0, '*****')``.
    """
    end = content.lower().find(_TITLE_END)
    if end < 0:
        return 0, '*****'
    return end, _as_text(content[_TITLE_START:end])


def run(start, stop=PAGE_LIMIT, workdir=WORK_DIR, urlh=BASE_URL):
    """Download node pages ``start`` .. ``stop - 1`` and save them locally.

    For every page number the URL ``urlh + 'node/<n>.html'`` is fetched, with
    up to 10 attempts.  A line ``'<n>: <title>'`` is printed for each
    non-empty page, and the page is written unchanged to
    ``<workdir>/nodes/<n>.html`` when a closing title tag is found after
    offset 0.  A page that still fails after the last attempt is skipped.

    Parameters:
        start:   first page number to fetch.
        stop:    page number at which to stop (exclusive).
        workdir: directory that holds (or will hold) the ``nodes`` folder.
        urlh:    base URL of the collection, ending in ``/``.
    """
    nodes_dir = os.path.join(workdir, 'nodes')
    for pagenum in range(start, stop):
        pagename = str(pagenum) + '.html'
        url = urlh + 'node/' + pagename
        for t in range(_MAX_TRIES):
            # Only the download is retried; a failure while writing the file
            # below must not be mistaken for a network error.
            try:
                content = _fetch(url)
            except IOError:
                print('->' + str(t))
                continue
            if content:
                titr, titl = _split_title(content)
                print(str(pagenum) + ': ' + titl)
                if titr > 0:
                    if not os.path.isdir(nodes_dir):
                        os.makedirs(nodes_dir)
                    # Binary mode stores the page exactly as it was received.
                    target = os.path.join(nodes_dir, pagename)
                    with open(target, 'wb') as save:
                        save.write(content)
            break