Appendix B. Program source code

epintable.py


from sqlalchemy import create_engine, ForeignKey
from sqlalchemy import Column, Date, Integer, String, Boolean
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Product(Base):
    """A product page on epinions.com, identified by its URL."""
    __tablename__ = 'product'

    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True, nullable=False)
    name = Column(String, nullable=False)
    category = Column(String, nullable=False)

    def __init__(self, name, category, url):
        self.name = name
        self.category = category
        self.url = url

    def __repr__(self):
        return u"<Product(%s, '%s', '%s', '%s')>" % (self.id, self.name, self.category, self.url)

    def get_persistent(self, session):
        # Return the already-stored row with the same URL, if any.
        return session.query(Product).filter(Product.url == self.url).first()


class Reviewer(Base):
    """An epinions.com user profile; equality and hashing are based on the profile URL."""
    __tablename__ = 'reviewer'

    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True, nullable=False)
    username = Column(String, nullable=False)

    def get_persistent(self, session):
        return session.query(Reviewer).filter(Reviewer.url == self.url).first()

    def __init__(self, username, url):
        self.url = url
        self.username = username

    def __eq__(self, other):
        return self.url == other.url

    def __hash__(self):
        return hash(self.url)

    def __repr__(self):
        return u"<Reviewer(%s, '%s', '%s')>" % (self.id, self.username, self.url)


class Review(Base):
    """A single review: overall rating, date, and links to its author and product."""
    __tablename__ = 'review'

    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True, nullable=False)
    date = Column(Date, nullable=False)
    rating = Column(Integer, nullable=False)
    reviewer_id = Column(Integer, ForeignKey("reviewer.id"))
    reviewer = relationship("Reviewer", backref=backref("reviews", order_by=id))
    product_id = Column(Integer, ForeignKey("product.id"))
    product = relationship("Product", backref=backref("reviews", order_by=id))

    def get_persistent(self, session):
        return session.query(Review).filter(Review.url == self.url).first()

    def __init__(self, date, rating, url):
        self.date = date
        self.rating = rating
        self.url = url

    def __repr__(self):
        return u"<Review(%s, %s, %s, reviewer=%s, product=%s)>" % (
            self.id, self.date, self.rating, self.reviewer_id, self.product_id)


class AttributeRating(Base):
    """A per-attribute rating attached to a review (one row per rated attribute)."""
    __tablename__ = 'detailed_rating'

    id = Column(Integer, primary_key=True)
    attribute = Column(String)
    rating = Column(Integer)
    review_id = Column(Integer, ForeignKey("review.id"))
    review = relationship("Review", backref=backref("attr_ratings", order_by=id))

    def __init__(self, attribute, rating):
        self.attribute = attribute
        self.rating = rating

    def __repr__(self):
        return "<AttributeRating('%s', %s, review=%s)>" % (self.attribute, self.rating, self.review_id)


def get_engine(filename):
    # Create (or open) the SQLite database file and make sure all tables exist.
    engine = create_engine('sqlite:///%s' % (filename), echo=False)
    Base.metadata.create_all(engine)
    return engine
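
For reference, a minimal usage sketch of the mappings above (not part of the crawler itself; the database filename example.sqlite, the product values and its URL are arbitrary illustrations):

# Usage sketch: open (or create) a SQLite database over the schema above,
# store one Product and read it back. The filename, values and URL are examples.
from sqlalchemy.orm import sessionmaker
from epintable import get_engine, Product

engine = get_engine('example.sqlite')
session = sessionmaker(bind=engine)()

cam = Product('Example Camera', 'Digital Cameras', 'http://www.epinions.com/example-camera')
session.add(cam)
session.commit()

print session.query(Product).filter(Product.category == 'Digital Cameras').count()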

epinhelper.py


import urllib2
import socket
import time
import re
import lxml.html
import datetime
import sys

socket.setdefaulttimeout(40)


def retry(ExceptionToCheck, tries=4, delay=3, backoff=4, logger=None):
    """Retry calling the decorated function using an exponential backoff.

    http://www.saltycrane.com/blog/2009/11/trying-out-retry-decorator-python/
    original from: http://wiki.python.org/moin/PythonDecoratorLibrary#Retry

    :param ExceptionToCheck: the exception to check. May be a tuple of
        exceptions to check
    :type ExceptionToCheck: Exception or tuple
    :param tries: number of times to try (not retry) before giving up
    :type tries: int
    :param delay: initial delay between retries in seconds
    :type delay: int
    :param backoff: backoff multiplier, e.g. a value of 2 will double the delay
        each retry
    :type backoff: int
    :param logger: logger to use. If None, print
    :type logger: logging.Logger instance
    """
    def deco_retry(f):
        def f_retry(*args, **kwargs):
            mtries, mdelay = tries, delay
            while mtries > 1:
                try:
                    return f(*args, **kwargs)
                except ExceptionToCheck, e:
                    msg = "%s, Retrying in %d seconds..." % (str(e), mdelay)
                    if logger:
                        logger.warning(msg)
                    else:
                        print msg
                    time.sleep(mdelay)
                    mtries -= 1
                    mdelay *= backoff
            return f(*args, **kwargs)
        return f_retry  # true decorator
    return deco_retry


@retry(urllib2.URLError, tries=4, delay=3, backoff=4)
def urlopen_with_retry(address):
    return urllib2.urlopen(address)


def get_html(page_address):
    try:
        return urlopen_with_retry(page_address).read()
    except urllib2.URLError:
        print "Couldn't open %s" % (page_address)
        raise


def parse_url(url):
    # Download a page and return its lxml tree with links made absolute.
    html_doc = get_html(url)
    parsed_html = lxml.html.fromstring(html_doc)
    parsed_html.make_links_absolute('http://www.epinions.com')
    return parsed_html


def convert_date(date):
    # Convert a date string such as "Jan 05 '09" into a datetime.date object.
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    match = re.search(r'^(\w\w\w).(\d\d).\'(\d\d)$', date)
    month_num = months.index(match.group(1)) + 1
    short_year = int(match.group(3))
    year = 1900 + short_year if short_year > 90 else 2000 + short_year
    return datetime.date(year, month_num, int(match.group(2)))


def count_pages(parsed_page, url_template):
    # Find the largest page number linked from the current listing page.
    regex = re.compile('^%s$' % url_template.format(pagenum='(\d+)'))
    page_links = [i for i in parsed_page.xpath('//a/@href') if i is not None and re.search(regex, i)]
    page_nums = [int(re.search(regex, i).group(1)) for i in page_links]
    return max(page_nums) if page_nums else 1


def get_paged_data(get_data_from_page):
    # Turn a single-page parser into a generator that walks all pages of a listing.
    def f(url_template):
        parsed_page = parse_url(url_template.format(pagenum='1'))
        Npages = count_pages(parsed_page, url_template)
        for i in range(1, Npages + 1):
            parsed_url = parsed_page if i == 1 else parse_url(url_template.format(pagenum=str(i)))
            for j in get_data_from_page(parsed_url):
                yield j
    return f


def unzip(original):
    # Split a list of pairs into a pair of lists.
    return ([a for a, b in original], [b for a, b in original])
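
As an illustration of the date strings convert_date accepts (a three-letter month, a two-digit day and a two-digit year prefixed with an apostrophe, as printed on epinions.com listings; the concrete values below are made-up examples):

# Examples of the format matched by the regular expression in convert_date:
#   "Jan 05 '09" -> datetime.date(2009, 1, 5)
#   "Dec 31 '99" -> datetime.date(1999, 12, 31)  (two-digit years above 90 map to 19xx)
from epinhelper import convert_date

print convert_date("Jan 05 '09")
print convert_date("Dec 31 '99")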

epinparsers.py


import lxml.html
from lxml import etree
import re
import datetime

from epinhelper import parse_url, convert_date, get_paged_data
from epintable import Product, Reviewer, Review, AttributeRating, ReviewContents


def top_authors_from_page(parsed_html):
    tags = parsed_html.xpath('//div[@]/table[2]'
                             '/tr/td[3]/table[3]/tr/td[2]/span/b/a')
    return [Reviewer(t.text, t.get('href')) for t in tags]


def top_authors_list(url):
    template = url + '/pp_{pagenum}'
    top_authors = get_paged_data(top_authors_from_page)
    return top_authors(template)


def parse_author_review_record(review_tag):
    try:
        date_written = convert_date(review_tag.xpath('td[1]/span/text()')[0].strip())
        review_url = review_tag.xpath('td[2]/span/b/a/@href')[0]
        rating_re = re.compile('http://img.epinions.com/images/epi_images/e3/(\d)_med_stars.gif')
        rating = int(re.match(rating_re, review_tag.xpath('td[5]/span/img/@src')[0]).group(1))
        prod_url = review_tag.xpath('td[3]/span[1]/a/@href')[0]
        prod_name = review_tag.xpath('td[3]/span[1]/a/text()')[0]
        prod_category = review_tag.xpath('td[3]/span[2]/a/text()')[0]
        return Review(date_written, rating, review_url), Product(prod_name, prod_category, prod_url)
    except:
        return None


def author_reviews_from_page(parsed_html):
    path = '/html/body/div[3]/div/table[2]/tr/td[2]/table/tr/td/table[2]/tr'
    records = parsed_html.xpath(path)[1:-1][::2]
    return [i for i in [parse_author_review_record(r) for r in records] if i is not None]


def author_reviews(rev):
    template = rev.url + "/sec_public_profile_opinion_list/show_content/contype_opinion/pp_{pagenum}/pa_1#list"
    review_records = get_paged_data(author_reviews_from_page)
    return review_records(template)


def get_trust_paged(parsed_html):
    trust_tags = parsed_html.xpath('//div[@class = "body_container_padded"]'
                                   '/table[2]/tr/td[2]/table/tr/td/table[2]/tr/td[1]/span/a/b/../../../..')
    author_urls = [t.xpath('td[1]/span/a/@href')[0] for t in trust_tags]
    dates_raw = [t.xpath('td[3]/span/text()')[0].strip() for t in trust_tags]
    dates = [convert_date(i) if i != '-' else datetime.date(2001, 1, 1) for i in dates_raw]
    return [(Reviewer(a), d) for a, d in zip(author_urls, dates)]


def trusted_by(reviewer):
    template = reviewer.url + "/sec_WOT_list/show_trust/pp_{pagenum}/pa_1/contype_trustedby"
    trust_records = get_paged_data(get_trust_paged)
    return trust_records(template)


def trusts(reviewer):
    template = reviewer.url + "/sec_WOT_list/show_trust/pp_{pagenum}/pa_1"
    profile = parse_url(reviewer.url)
    path = ('/html/body/div[3]/div[1]/table[2]/tr[1]'
            '/td[1]/table/tr/td/table/tr/td/table/tr[3]/td/span[1]/text()')
    trust_status = not bool(re.search(r'\bhidden\b', profile.xpath(path)[0]))
    trust_records = get_paged_data(get_trust_paged)
    return trust_status, trust_records(template)


def get_attribute_ratings(parsed_html):
    tags = parsed_html.xpath('//ul[@class = "user_review_chart"]/li')[1:]
    attributes = [i.xpath('span[1]/text()')[0][:-1] for i in tags]
    ratings_raw = [i.xpath('span[2]/img/@src')[0] for i in tags]
    rating_regexp = re.compile(r'http://img.epinions.com/images/epi_images/e3/quant_(\d).gif')
    ratings = [int(re.search(rating_regexp, i).group(1)) for i in ratings_raw]
    attr_ratings = [AttributeRating(attr, r) for (attr, r) in zip(attributes, ratings)]
    return attr_ratings


def get_review_contents(parsed_html):
    review_summary = {}
    rs_tags = parsed_html.xpath('//div[@ or @]/span')
    for tag in rs_tags:
        b = list(tag.iterfind('b'))[0]
        review_attr = re.search(r'(\S.*\S):', b.text).group(1)
        review_summary[review_attr] = etree.tostring(tag).strip()
    summary_content = set(['Pros', 'Cons', 'The Bottom Line'])
    for i in summary_content.difference(set(review_summary.keys())):
        review_summary[i] = None
    review_text = etree.tostring(parsed_html.xpath('//p[contains(@class, "description")]')[0])
    m = re.search(r'Recommended:\s*(Yes|No)', review_text)
    if m:
        is_recommended = bool(['No', 'Yes'].index(m.group(1)))
    else:
        is_recommended = None
    review_contents = ReviewContents(review_summary['Pros'], review_summary['Cons'],
                                     review_summary['The Bottom Line'], is_recommended)
    return review_contents


def get_review_details(review):
    parsed_html = parse_url(review.url)
    return get_review_contents(parsed_html), get_attribute_ratings(parsed_html)
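
A sketch of how these parsers chain together outside the crawler (the listing URL below is a hypothetical placeholder for a top-reviewers page; the actual crawl is driven by EpCrawler and EpinDB):

# Hypothetical entry point: walk a top-reviewer listing and print each
# reviewer's reviews. The URL is an illustrative placeholder.
from epinparsers import top_authors_list, author_reviews

for reviewer in top_authors_list('http://www.epinions.com/user-top_reviewers'):
    for review, product in author_reviews(reviewer):
        print reviewer.username, product.category, product.name, review.rating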

epcrawler.py


import pickle
import time
import socket
import urllib2

from epinparsers import trusts, trusted_by


class EpCrawler:
    """Breadth-first crawler over the epinions.com web of trust."""

    def __init__(self, dbfilename, pkfilename, starting_set):
        self.dbfilename = dbfilename
        self.pkfilename = pkfilename
        self.session = get_session(dbfilename)
        self.frontier = starting_set
        self.explored = set()
        self.failed = []
        self.session.add_all(self.frontier)
        self.session.commit()

    def save(self):
        # Persist the crawler state so an interrupted crawl can be resumed.
        with open(self.pkfilename, 'wb') as outfile:
            pickle.dump(self, outfile)

    def run(self, time_limit):
        start_time = time.time()
        while self.frontier and time.time() - start_time < time_limit:
            current_node = self.frontier[0]
            try:
                trust_stat, trust_iter = trusts(current_node)
                current_node.show_trust = trust_stat
                trust_dict = dict(trust_iter) if trust_stat else {}
                trusted_by_dict = dict(trusted_by(current_node))
                distinct_connected = set(trust_dict.keys() + trusted_by_dict.keys())
                newly_added = distinct_connected.difference(self.frontier
                                                            + list(self.explored) + self.failed)
                for rev in set(trust_dict.keys()).difference(self.explored):
                    current_node.add_trust(rev, trust_dict[rev])
                for rev in set(trusted_by_dict.keys()).difference(self.explored):
                    rev.add_trust(current_node, trusted_by_dict[rev])
            except (urllib2.URLError, socket.error):
                print "There was a connection error while processing %s" % (current_node)
                self.session.rollback()
                self.frontier.pop(0)
                self.failed.append(current_node)
            except:
                print "Unexpected error"
                self.session.rollback()
                self.save()
                raise
            else:
                self.session.commit()
                self.frontier.pop(0)
                self.explored.add(current_node)
                self.frontier += list(newly_added)
                self.save()

    def __getstate__(self):
        # Exclude the (unpicklable) database session from the saved state.
        odict = self.__dict__.copy()
        del odict['session']
        return odict

    def __setstate__(self, dict):
        self.__dict__.update(dict)
        self.session = get_session(dict['dbfilename'])
        self.session.add_all(self.frontier)
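
A launch sketch, under the assumption that get_session() (used by EpCrawler but not shown in this listing) wraps get_engine()/sessionmaker from epintable.py; the seed URL and the pickle filename are hypothetical, the one-hour limit is arbitrary, and crawl1.sqlite is the database name used by the R scripts below:

# Assumed launch sequence: seed the frontier with the top reviewers and
# crawl the web of trust for at most one hour.
from epinparsers import top_authors_list
from epcrawler import EpCrawler

seed = list(top_authors_list('http://www.epinions.com/user-top_reviewers'))
crawler = EpCrawler('crawl1.sqlite', 'crawler.pickle', seed)
crawler.run(3600)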

epindb.py


import sys
import socket
import urllib2

from sqlalchemy.orm import sessionmaker

from epintable import get_engine
from epinparsers import trusts, author_reviews, get_review_details


class EpinDB():

    def __init__(self, filename):
        self.engine = get_engine(filename)
        Session = sessionmaker(bind=self.engine)
        self.session = Session()

    def smart_add(self, obj):
        # Return the already-stored copy of obj if one exists, otherwise add obj.
        persist_obj = obj.get_persistent(self.session)
        if persist_obj:
            return persist_obj
        else:
            self.session.add(obj)
            return obj

    def add_connections(self, rev, trust_iter):
        add_dates = dict(trust_iter)
        for trusted_r in add_dates.keys():
            if trusted_r in network_reviewers:
                rev.add_trust(self.session.merge(trusted_r), add_dates[trusted_r])

    def add_user_connections(self, user):
        try:
            trust_status, trust_iter = trusts(user)
            user.show_trust = trust_status
            if trust_status:
                self.add_connections(user, trust_iter)
        except (urllib2.URLError, socket.error):
            self.session.rollback()
            print "Couldn't add connections for user: %s" % (user)
        except KeyboardInterrupt:
            self.session.rollback()
            raise
        except Exception, err:
            self.session.rollback()
            sys.stderr.write('Unexpected error: %s\n' % str(err))
            print "Couldn't add connections for user: %s" % (user)
        else:
            self.session.commit()

    def add_review_details(self, review):
        try:
            contents, attr_rat = get_review_details(review)
            contents.review = review
            self.session.add(contents)
            for i in attr_rat:
                i.review = review
                self.session.add(i)
        except (urllib2.URLError, socket.error):
            self.session.rollback()
            print "Couldn't add details for review: %s" % (review)
        except KeyboardInterrupt:
            self.session.rollback()
            raise
        except Exception, err:
            self.session.rollback()
            sys.stderr.write('Unexpected error: %s\n' % str(err))
            print "Couldn't add details for review: %s" % (review)
        else:
            self.session.commit()

    def add_user_reviews(self, user):
        try:
            for r in author_reviews(user):
                rev, prod = r
                prod = self.smart_add(prod)
                rev.reviewer = user
                rev.product = prod
                rev = self.smart_add(rev)
        except (urllib2.URLError, socket.error):
            self.session.rollback()
            print "Couldn't add reviews for user: %s" % (user)
        except KeyboardInterrupt:
            self.session.rollback()
            raise
        except Exception, err:
            self.session.rollback()
            sys.stderr.write('Unexpected error: %s\n' % str(err))
            print "Couldn't add reviews for user: %s" % (user)
        else:
            self.session.commit()
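
A usage sketch for EpinDB, assuming the database produced by the crawler (crawl1.sqlite, the same file the R scripts below read); the loop simply revisits every stored review to collect its per-attribute ratings:

# Collect detailed per-attribute ratings for every review already in the database.
from epindb import EpinDB
from epintable import Review

db = EpinDB('crawl1.sqlite')
for review in db.session.query(Review).all():
    db.add_review_details(review)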

transform_user_reviews.r


setwd("C:/Users/const/Desktop/current/epinions/analysis/r")

library(plyr)

library(DBI)

library(RSQLite)

dbPath <- "crawl1.sqlite"

output_file <- "dataset_new.csv"

drv <- dbDriver("SQLite")

db <- dbConnect(drv, dbPath)

myQuery <- "select revcats.reviewer_id reviewer_id, revcats.aggr_category category,

sum(case when reviews.review_url is null then 0 else 1 end) reviews

from

(

select reviewer.id reviewer_id, cat.new_category aggr_category

from reviewer join (select distinct new_category from catmap1) cat

) revcats

left outer join (

select review.reviewer_id reviewer_id,

cat.new_category aggr_category, review.url review_url

from review inner join product

on review.product_url = product.url inner join catmap1 cat

on product.category = cat.category

) reviews

on revcats.reviewer_id = reviews.reviewer_id and

revcats.aggr_category = reviews.aggr_category

group by reviewer_id, category

order by reviewer_id, category"

rawdata <- dbGetQuery(db, myQuery)

categories <- sort(unique(rawdata$category))

mydata <-ddply(rawdata, .(reviewer_id), function(df) df$reviews)

names(mydata) <- c("reviewer_id", categories)

write.table(mydata, file = output_file, quote = FALSE, sep = ";",

row.names = FALSE)

get_rating_data.r


setwd("C:/Users/const/Desktop/current/epinions/analysis/r")

library(plyr)

library(DBI)

library(RSQLite)

dbPath <- "crawl1.sqlite"

output_file <- "cameras_new.csv"

drv <- dbDriver("SQLite")

db <- dbConnect(drv, dbPath)

query_temptable <- "create temp table cat_rating as

select detr.*

from detailed_rating detr inner join review

on detr.review_id = review.id inner join product

on review.product_id = product.id

where product.category = 'Digital Cameras'"

dbSendQuery(db, query_temptable)

query_rawdata <- "select rev.*, attr.*, cat_rating.rating

from (

select distinct review_id

from cat_rating

) rev inner join (

select distinct attribute

from cat_rating

) attr left outer join cat_rating

on rev.review_id = cat_rating.review_id and

attr.attribute = cat_rating.attribute

order by review_id, attribute"

query_reviews <- "select id review_id, reviewer_id, rating overall

from review

where review_id in (select review_id from cat_rating)"

rawdata <- dbGetQuery(db, query_rawdata)

reviews <- dbGetQuery(db, query_reviews)

attributes <- sort(unique(rawdata$attribute))

mydata <-ddply(rawdata, .(review_id), function(df) df$rating)

names(mydata) <- c("review_id", attributes)

mydata <- merge(mydata, reviews, by = "review_id")

write.table(mydata, file = output_file, quote = FALSE, sep = ";",

row.names = FALSE)