epintable.py
from sqlalchemy import create_engine, ForeignKey
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Column, Date, Integer, String, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref
Base = declarative_base()
class Product(Base):
__tablename__ = 'product'
id = Column(Integer, primary_key = True)
url = Column(String, unique = True, nullable = False)
name = Column(String, nullable = False)
category = Column(String, nullable = False)
def __init__(self, name, category, url):
self.name = name
self.category = category
self.url = url
def __repr__(self):
return u" " % (self.id, self.name, self.category, self.url)
def get_persistent(self, session):
return session.query(Product).filter(Product.url==self.url).first()
class Reviewer(Base):
__tablename__ = 'reviewer'
id = Column(Integer, primary_key = True)
url = Column(String, unique = True, nullable = False)
username = Column(String, nullable = False)
def get_persistent(self, session):
return session.query(Reviewer).filter(Reviewer.url==self.url).first()
def __init__(self, username, url):
self.url = url
self.username = username
def __eq__(self, other):
return self.url == other.url
def __hash__(self):
return hash(self.url)
def __repr__(self):
return u"" % (self.id, self.username, self.url)
class Review(Base):
__tablename__ = 'review'
id = Column(Integer, primary_key = True)
url = Column(String, unique = True, nullable = False)
date = Column(Date, nullable = False)
rating = Column(Integer, nullable = False)
reviewer_id = Column(Integer, ForeignKey("reviewer.id"))
reviewer = relationship("Reviewer", backref = backref("reviews", order_by=id))
product_id = Column(Integer, ForeignKey("product.id"))
product = relationship("Product", backref = backref("reviews", order_by=id))
def get_persistent(self, session):
return session.query(Review).filter(Review.url==self.url).first()
def __init__(self, date, rating, url):
self.date = date
self.rating = rating
self.url = url
def __repr__(self):
return u"" % (self.id, self.date, self.rating, self.reviewer_id, self.product_id)
class AttributeRating(Base):
__tablename__ = 'detailed_rating'
id = Column(Integer, primary_key = True)
attribute = Column(String)
rating = Column(Integer)
review_id = Column(Integer, ForeignKey("review.id"))
review = relationship("Review", backref = backref("attr_ratings"), order_by = id)
def __init__(self, attribute, rating):
self.attribute = attribute
self.rating = rating
def __repr__(self):
return "" % (self.attribute, self.rating, self.review_id)
def get_engine(filename):
engine = create_engine('sqlite:///%s' % (filename), echo=False)
Base.metadata.create_all(engine)
return engine
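A minimal sketch of how these models and get_engine fit together, assuming a throwaway SQLite file and made-up product values:

from sqlalchemy.orm import sessionmaker

engine = get_engine('epinions_test.sqlite')
Session = sessionmaker(bind=engine)
session = Session()

# Add a product and read it back through its unique url.
prod = Product('Example Camera', 'Digital Cameras', 'http://www.epinions.com/example-camera')
session.add(prod)
session.commit()
print prod.get_persistent(session)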
epinhelper.py
import urllib2
import socket
import time
import re
import lxml.html
import datetime
import sys
socket.setdefaulttimeout(40)
def retry(ExceptionToCheck, tries=4, delay=3, backoff=4, logger=None):
"""Retry calling the decorated function using an exponential backoff.
http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry
:param ExceptionToCheck: the exception to check. may be a tuple of
excpetions to check
:type ExceptionToCheck: Exception or tuple
:param tries: number of times to try (not retry) before giving up
:type tries: int
:param delay: initial delay between retries in seconds
:type delay: int
:param backoff: backoff multiplier e.g. value of 2 will double the delay
each retry
:type backoff: int
:param logger: logger to use. If None, print
:type logger: logging.Logger instance
"""
def deco_retry(f):
def f_retry(*args, **kwargs):
mtries, mdelay = tries, delay
while mtries > 1:
try:
return f(*args, **kwargs)
except ExceptionToCheck, e:
msg = "%s, Retrying in %d seconds..." % (str(e), mdelay)
if logger:
logger.warning(msg)
else:
print msg
time.sleep(mdelay)
mtries -= 1
mdelay *= backoff
return f(*args, **kwargs)
return f_retry # true decorator
return deco_retry
@retry(urllib2.URLError, tries=4, delay=3, backoff=4)
def urlopen_with_retry(address):
return urllib2.urlopen(address)
def get_html(page_address):
try:
return urlopen_with_retry(page_address).read()
except urllib2.URLError:
print "Couldn't open %s" % (page_address)
raise
def parse_url(url):
html_doc = get_html(url)
parsed_html = lxml.html.fromstring(html_doc)
parsed_html.make_links_absolute('http://www.epinions.com')
return parsed_html
def convert_date(date):
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
match = re.search(r'^(\w\w\w).(\d\d).\'(\d\d)$', date)
month_num = months.index(match.group(1)) + 1
short_year = int(match.group(3))
year = 1900 + short_year if short_year > 90 else 2000 + short_year
return datetime.date(year, month_num, int(match.group(2)))
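convert_date expects the short Epinions date format (three-letter month, zero-padded day, two-digit year); a quick worked example with a made-up date:

print convert_date("Sep 28 '11")   # -> 2011-09-28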
def count_pages(parsed_page, url_template):
regex = re.compile('^%s$' % url_template.format(pagenum='(\d+)'))
page_links = [i for i in parsed_page.xpath('//a/@href') if i is not None and re.search(regex, i)]
page_nums = [int(re.search(regex,i).group(1)) for i in page_links]
return max(page_nums) if page_nums else 1
def get_paged_data(get_data_from_page):
def f(url_template):
parsed_page = parse_url(url_template.format(pagenum = '1'))
Npages = count_pages(parsed_page, url_template)
for i in range(1, Npages+1):
parsed_url = parsed_page if i == 1 else parse_url(url_template.format(pagenum=str(i)))
for j in get_data_from_page(parsed_url):
yield j
return f
def unzip(original):
return ([ a for a,b in original ], [ b for a,b in original ])
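get_paged_data wraps a per-page extractor so that a single call walks every page of a paginated Epinions listing; a hedged usage sketch (the selector and listing URL below are invented for illustration):

def product_links_from_page(parsed_page):
    # Hypothetical extractor: pull hrefs out of each parsed listing page.
    return parsed_page.xpath('//a[@class="product_link"]/@href')

product_links = get_paged_data(product_links_from_page)
for link in product_links('http://www.epinions.com/some_listing/pp_{pagenum}'):
    print link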
epinparsers.py
import lxml.html
from lxml import etree
import re
import datetime
from epinhelper import parse_url, convert_date, get_paged_data
from epintable import Product, Reviewer, Review, AttributeRating, ReviewContents
def top_authors_from_page(parsed_html):
tags = parsed_html.xpath('//div[@]/table[2]\
/tr/td[3]/table[3]/tr/td[2]/span/b/a')
return [Reviewer(t.text, t.get('href')) for t in tags]
def top_authors_list(url):
template = url + '/pp_{pagenum}'
top_authors = get_paged_data(top_authors_from_page)
return top_authors(template)
def parse_author_review_record(review_tag):
try:
date_written = convert_date(review_tag.xpath('td[1]/span/text()')[0].strip())
review_url = review_tag.xpath('td[2]/span/b/a/@href')[0]
rating_re = re.compile('http://img.epinions.com/images/epi_images/e3/(\d)_med_stars.gif')
rating = int(re.match(rating_re,review_tag.xpath('td[5]/span/img/@src')[0]).group(1))
prod_url = review_tag.xpath('td[3]/span[1]/a/@href')[0]
prod_name = review_tag.xpath('td[3]/span[1]/a/text()')[0]
prod_category = review_tag.xpath('td[3]/span[2]/a/text()')[0]
return Review(date_written, rating, review_url), Product(prod_name, prod_category, prod_url)
except (IndexError, AttributeError): # a missing field means this row is not a review record
return None
def author_reviews_from_page(parsed_html):
path = '/html/body/div[3]/div/table[2]/tr/td[2]/table/tr/td/table[2]/tr'
records = parsed_html.xpath(path)[1:-1][::2]
return [i for i in [parse_author_review_record(r) for r in records] if i is not None]
def author_reviews(rev):
template = rev.url + "/sec_public_profile_opinion_list/show_content/contype_opinion/pp_{pagenum}/pa_1#list"
review_records = get_paged_data(author_reviews_from_page)
return review_records(template)
def get_trust_paged(parsed_html):
trust_tags = parsed_html.xpath('//div[@class = "body_container_padded"]\
/table[2]/tr/td[2]/table/tr/td/table[2]/tr/td[1]/span/a/b/../../../..')
author_urls = [t.xpath('td[1]/span/a/@href')[0] for t in trust_tags]
dates_raw = [t.xpath('td[3]/span/text()')[0].strip() for t in trust_tags]
dates = [convert_date(i) if i != '-' else datetime.date(2001,1,1) for i in dates_raw]
author_names = [t.xpath('td[1]/span/a/b/text()')[0] for t in trust_tags] # the username sits in the <b> the selector anchors on
return [(Reviewer(n, a), d) for n, a, d in zip(author_names, author_urls, dates)]
def trusted_by(reviewer):
template = reviewer.url + "/sec_WOT_list/show_trust/pp_{pagenum}/pa_1/contype_trustedby"
trust_records = get_paged_data(get_trust_paged)
return trust_records(template)
def trusts(reviewer):
template = reviewer.url + "/sec_WOT_list/show_trust/pp_{pagenum}/pa_1"
profile = parse_url(reviewer.url)
path = '/html/body/div[3]/div[1]/table[2]/tr[1]\
/td[1]/table/tr/td/table/tr/td/table/tr[3]/td/span[1]/text()'
trust_status = not bool(re.search(r'\bhidden\b',profile.xpath(path)[0]))
trust_records = get_paged_data(get_trust_paged)
return trust_status, trust_records(template)
def get_attribute_ratings(parsed_html):
tags = parsed_html.xpath('//ul[@class = "user_review_chart"]/li')[1:]
attributes = [i.xpath('span[1]/text()')[0][:-1] for i in tags]
ratings_raw = [i.xpath('span[2]/img/@src')[0] for i in tags]
rating_regexp = re.compile(r'http://img.epinions.com/images/epi_images/e3/quant_(\d).gif')
ratings = [int(re.search(rating_regexp, i).group(1))
for i in ratings_raw]
attr_ratings = [AttributeRating(attr, r) for (attr, r) in zip(attributes, ratings)]
return attr_ratings
def get_review_contents(parsed_html):
review_summary = {}
rs_tags = parsed_html.xpath('//div[@ or @]/span')
for tag in rs_tags:
b = list(tag.iterfind('b'))[0]
review_attr = re.search(r'(\S.*\S):', b.text).group(1)
review_summary[review_attr] = etree.tostring(tag).strip()
summary_content = set(['Pros', 'Cons', 'The Bottom Line'])
for i in summary_content.difference(set(review_summary.keys())):
review_summary[i] = None
review_text = etree.tostring(parsed_html.xpath('//p[contains(@class, "description")]')[0])
m = re.search(r'Recommended:\s*(Yes|No)', review_text)
if m:
is_recommended = bool(['No', 'Yes'].index(m.group(1)))
else:
is_recommended = None
review_contents = ReviewContents(review_summary['Pros'], review_summary['Cons'],
review_summary['The Bottom Line'], is_recommended)
return review_contents
def get_review_details(review):
parsed_html = parse_url(review.url)
return get_review_contents(parsed_html), get_attribute_ratings(parsed_html)
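Taken together, the parsers walk from the top-reviewer listing down to individual review details; a minimal sketch, assuming the placeholder listing URL below and that the Epinions pages still match the hard-coded XPaths:

for reviewer in top_authors_list('http://www.epinions.com/user-top_reviewers'):
    for review, product in author_reviews(reviewer):
        contents, attr_ratings = get_review_details(review)
        print reviewer.username, product.name, review.rating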
epcrawler.py
import time
import pickle
import socket
import urllib2
from epinparsers import trusts, trusted_by
# get_session(dbfilename) is assumed to be defined elsewhere; a sketch follows this class.
class EpCrawler:
def __init__(self, dbfilename, pkfilename, starting_set):
self.dbfilename = dbfilename
self.pkfilename = pkfilename
self.session = get_session(dbfilename)
self.frontier = starting_set
self.explored = set()
self.failed = []
self.session.add_all(self.frontier)
self.session.commit()
def save(self):
with open(self.pkfilename,'wb') as outfile:
pickle.dump(self, outfile)
def run(self, time_limit):
start_time = time.time()
while self.frontier and time.time() - start_time < time_limit:
current_node = self.frontier[0]
try:
trust_stat, trust_iter = trusts(current_node)
current_node.show_trust = trust_stat
trust_dict = dict(trust_iter) if trust_stat else {}
trusted_by_dict = dict(trusted_by(current_node))
distinct_connected = set(trust_dict.keys() + trusted_by_dict.keys())
newly_added = distinct_connected.difference(self.frontier
+ list(self.explored) + self.failed)
for rev in set(trust_dict.keys()).difference(self.explored):
current_node.add_trust(rev,trust_dict[rev])
for rev in set(trusted_by_dict.keys()).difference(self.explored):
rev.add_trust(current_node,trusted_by_dict[rev])
except (urllib2.URLError, socket.error):
print "There was a connection error while processing %s" % (current_node)
self.session.rollback()
self.frontier.pop(0)
self.failed.append(current_node)
except:
print "Unexpected error"
self.session.rollback()
self.save()
raise
else:
self.session.commit()
self.frontier.pop(0)
self.explored.add(current_node)
self.frontier += list(newly_added)
self.save()
def __getstate__(self):
odict = self.__dict__.copy()
del odict['session']
return odict
def __setstate__(self, dict):
self.__dict__.update(dict)
self.session = get_session(dict['dbfilename'])
self.session.add_all(self.frontier)
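EpCrawler relies on a get_session helper that never appears in the listing (it also assumes Reviewer has a show_trust attribute and an add_trust(reviewer, date) method, neither of which is shown above). A minimal sketch of what get_session could look like, assuming it simply binds a session to the engine from epintable.py:

from sqlalchemy.orm import sessionmaker
from epintable import get_engine

def get_session(dbfilename):
    # Assumed helper: open (or create) the SQLite file and return a bound session.
    engine = get_engine(dbfilename)
    Session = sessionmaker(bind=engine)
    return Session()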
epindb.py
import sys
import socket
import urllib2
from sqlalchemy.orm import sessionmaker
from epintable import get_engine
from epinparsers import trusts, author_reviews, get_review_details
class EpinDB(object):
def __init__(self, filename):
self.engine = get_engine(filename)
Session = sessionmaker(bind = self.engine)
self.session = Session()
def smart_add(self, obj):
persist_obj = obj.get_persistent(self.session)
if persist_obj:
return persist_obj
else:
self.session.add(obj)
return obj
def add_connections(self, rev, trust_iter):
add_dates = dict(trust_iter)
for trusted_r in add_dates.keys():
if trusted_r in network_reviewers: # network_reviewers: reviewers already in the crawl, assumed to be defined elsewhere
rev.add_trust(self.session.merge(trusted_r), add_dates[trusted_r])
def add_user_connections(self, user):
try:
trust_status, trust_iter = trusts(user)
user.show_trust = trust_status
if trust_status:
self.add_connections(user, trust_iter)
except (urllib2.URLError, socket.error):
self.session.rollback()
print "Couldn't add connections for user: %s" % (user)
except KeyboardInterrupt:
self.session.rollback()
raise
except Exception, err:
self.session.rollback()
sys.stderr.write('Unexpected error: %s\n' % str(err))
print "Couldn't add connections for user: %s" % (user)
else:
self.session.commit()
def add_review_details(self, review):
try:
contents, attr_rat = get_review_details(review)
contents.review = review
self.session.add(contents)
for i in attr_rat:
i.review = review
self.session.add(i)
except (urllib2.URLError, socket.error):
self.session.rollback()
print "Couldn't add details for review: %s" % (review)
except KeyboardInterrupt:
self.session.rollback()
raise
except Exception, err:
self.session.rollback()
sys.stderr.write('Unexpected error: %s\n' % str(err))
print "Couldn't add details for review: %s" % (review)
else:
self.session.commit()
def add_user_reviews(self, user):
try:
for r in author_reviews(user):
rev, prod = r
prod = self.smart_add(prod)
rev.reviewer = user
rev.product = prod
rev = self.smart_add(rev)
except (urllib2.URLError, socket.error):
self.session.rollback()
print "Couldn't add reviews for user: %s" % (user)
except KeyboardInterrupt:
self.session.rollback()
raise
except Exception, err:
self.session.rollback()
sys.stderr.write('Unexpected error: %s\n' % str(err))
print "Couldn't add reviews for user: %s" % (user)
else:
self.session.commit()
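A hedged example of driving EpinDB from a script, assuming the placeholder listing URL below and an existing crawl database:

from epinparsers import top_authors_list

db = EpinDB('crawl1.sqlite')
for reviewer in top_authors_list('http://www.epinions.com/user-top_reviewers'):
    reviewer = db.smart_add(reviewer)
    db.add_user_reviews(reviewer)
    for review in reviewer.reviews:
        db.add_review_details(review)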
transform_user_reviews.r
setwd("C:/Users/const/Desktop/current/epinions/analysis/r")
library(plyr)
library(DBI)
library(RSQLite)
dbPath <- "crawl1.sqlite"
output_file <- "dataset_new.csv"
drv <- dbDriver("SQLite")
db <- dbConnect(drv, dbPath)
myQuery <- "select revcats.reviewer_id reviewer_id, revcats.aggr_category category,
sum(case when reviews.review_url is null then 0 else 1 end) reviews
from
(
select reviewer.id reviewer_id, cat.new_category aggr_category
from reviewer join (select distinct new_category from catmap1) cat
) revcats
left outer join (
select review.reviewer_id reviewer_id,
cat.new_category aggr_category, review.url review_url
from review inner join product
on review.product_id = product.id inner join catmap1 cat
on product.category = cat.category
) reviews
on revcats.reviewer_id = reviews.reviewer_id and
revcats.aggr_category = reviews.aggr_category
group by reviewer_id, category
order by reviewer_id, category"
rawdata <- dbGetQuery(db, myQuery)
categories <- sort(unique(rawdata$category))
mydata <- ddply(rawdata, .(reviewer_id), function(df) df$reviews)
names(mydata) <- c("reviewer_id", categories)
write.table(mydata, file = output_file, quote = FALSE, sep = ";",
row.names = FALSE)
get_rating_data.r
setwd("C:/Users/const/Desktop/current/epinions/analysis/r")
library(plyr)
library(DBI)
library(RSQLite)
dbPath <- "crawl1.sqlite"
output_file <- "cameras_new.csv"
drv <- dbDriver("SQLite")
db <- dbConnect(drv, dbPath)
query_temptable <- "create temp table cat_rating as
select detr.*
from detailed_rating detr inner join review
on detr.review_id = review.id inner join product
on review.product_id = product.id
where product.category = 'Digital Cameras'"
dbSendQuery(db, query_temptable)
query_rawdata <- "select rev.*, attr.*, cat_rating.rating
from (
select distinct review_id
from cat_rating
) rev inner join (
select distinct attribute
from cat_rating
) attr left outer join cat_rating
on rev.review_id = cat_rating.review_id and
attr.attribute = cat_rating.attribute
order by review_id, attribute"
query_reviews <- "select id review_id, reviewer_id, rating overall
from review
where id in (select review_id from cat_rating)"
rawdata <- dbGetQuery(db, query_rawdata)
reviews <- dbGetQuery(db, query_reviews)
attributes <- sort(unique(rawdata$attribute))
mydata <- ddply(rawdata, .(review_id), function(df) df$rating)
names(mydata) <- c("review_id", attributes)
mydata <- merge(mydata, reviews, by = "review_id")
write.table(mydata, file = output_file, quote = FALSE, sep = ";",
row.names = FALSE)