#!/usr/bin/python
#############################################################################
#
# Amazon - Harvesting from Amazon.com
#
# file: amazon.net
# amazon.run('oldAmazon.net',nver0,narc,maxver)
#
# import sys
# sys.path.append(r'D:\Vlado\work\Python\Amazon')
#
# import amazon
# amazon.run('AmazonOld.net',14187,68860,100000)
# amazon.run('AmazonOld.net',17017,82520,100000)
# amazon.run('AmazonOld.net',39546,189930,500000)
# amazon.run('AmazonOld.net',51229,245187,500000)
# amazon.run('AmazonOld.net',65392,311572,500000)
# amazon.run('AmazonOld.net',67389,320844,500000)
# amazon.run('AmazonOld.net',84328,399474,500000)
# amazon.run('AmazonOld.net',92547,437163,500000)
# amazon.run('AmazonOld.net',100695,474562,500000)
# amazon.run('AmazonOld.net',132680,618564,500000)
# amazon.run('AmazonOld.net',144952,672480,500000)
# amazon.run('AmazonOld.net',145682,675781,500000)
# amazon.run('AmazonOld.net',160397,739653,500000)
# amazon.run('AmazonOld.net',171164,787017,500000)
# amazon.run('AmazonOld.net',196255,895208,500000)
# amazon.run('AmazonOld.net',209144,949874,500000)
#
# Vladimir Batagelj, 5/19. june 2004
#
###############################################################################
import urllib, urlparse, string, os
def new_vertex(bk,ti,au):
global nver, vert, books, maxver, nodes, titles, trace, narc
nver = nver + 1
trace.write(str(nver) + ' ' + str(narc) + '\n')
print 'New vertex(', nver, '): ' + bk
vert[bk] = nver
nodes.write(str(nver)+" '"+book+"'\n")
titles.write(str(nver)+" '"+book+"' "+au+" --> "+ti+"\n")
if nver < maxver: books.append(bk)
def new_arc(vini,vter):
global arcs, trace, narc, nver
narc = narc + 1
print 'New arc ', narc, ': (', vini, ',', vter, ')'
arcs.write(str(vini) + ' ' + str(vter) + '\n')
trace.write(str(nver) + ' ' + str(narc) + '\n')
def inspect(bk):
global lstr, rstr, lref, rtit, vert, nver, url1, book, titl, \
auth, url2, books, maxver, b, narc
vini = vert[bk]
url = url1 + bk + url2
page = urllib.urlopen(url)
a = page.read()
try:
lind = string.index(a,lstr) + len(lstr)
except ValueError:
lind = len(a)
try:
rind = string.index(a,rstr,lind)
except ValueError:
rind = len(a)
b = a[lind:rind]
while len(b) > 10:
try: # find '',linr)
url = b[linr:rinr]
print 'URL : ', url
b = b[rinr+1:]
rint = string.index(b,rtit) # find ''
titl = b[:rint]
b = b[rint + len(rtit):]
try:
lina = string.index(b,'by')
except ValueError:
auth = 'by UNKNOWN AUTHOR'
else:
rina = string.index(b,'\n',lina)
auth = b[lina:rina]
b = b[rina+1:]
try:
linu = string.index(url,'detail/-/') + 9
except ValueError:
print '*** nonstandard URL ***'
else:
try:
rinu = string.index(url,'?',linu)
except ValueError:
rinu = string.index(url,'/',linu)
book = url[linu:rinu]
print 'book: ', book, ' ', auth
if vert.has_key(book):
vter = vert[book]
else:
new_vertex(book,titl,auth)
vter = nver
new_arc(vini,vter)
def harvest():
global books
while books:
book= books[0]
del books[0]
inspect(book)
print '\nHarvest finished\n'
def run(vtxfile,nver0,narc0,maxver0):
global lstr, rstr, lref, rtit, vert, nver, url1, book, titl, auth, \
url2, books, nodes, arcs, titles, b, trace, narc, maxver
lstr = ''
rstr = ''
lref = ''
vert = {}
nver = 0
narc = narc0
maxver = maxver0
url1 = 'http://www.amazon.com/exec/obidos/tg/detail/-/'
url2 = '?v=glance'
#
workdir = 'd:\\vlado\\work\\python\\amazon\\'
titles = open(workdir+'amazon.tit', 'w')
trace = open(workdir+'amazon.dat', 'w')
nodes = open(workdir+'amazon.net', 'w')
nodes.write('*vertices \n')
books = []
try:
vtx = open(workdir+vtxfile, 'r')
except:
book = '0761956042'
titl = 'Introducing Social Networks'
auth = 'by Michel Forse (Author), Alain Degenne (Author)'
new_vertex(book,titl,auth)
else:
LL = vtx.readline()
for line in vtx.readlines() :
# print line
LL = line.split(' ');
bk = LL[1].split("'")[1]; bi = int(LL[0])
vert[bk] = bi
if bi > nver0: books.append(bk)
vtx.close()
nver = bi
#
arcs = open(workdir+'amazon.lin', 'w')
arcs.write('*arcs \n')
#
try:
harvest()
except:
print b
print 'book = ', book, ' arc = ', narc
print '\nInterrupted\n'
#
nodes.close()
arcs.close()
titles.close()
trace.close()