Commit f1da2904 authored by gorla's avatar gorla
Browse files

Added function to get the abstract if the DOI links to the ACM DL

parent 6092012c
......@@ -10,7 +10,7 @@ class Paper(object):
""" Paper metadata from DBLP. """
def __init__(self, venue, year, identifier, heading, dblp_id, title,
authors, page_range, electronic_edition):
authors, page_range, electronic_edition, abstract):
self.venue = venue
self.year = year
self.identifier = identifier
......@@ -24,6 +24,7 @@ class Paper(object):
self.last_page = -1
self.length = -1
self.electronic_edition = electronic_edition
self.abstract = abstract
self.comment = ""
self.regular_page_range = REGULAR_PAGE_RANGE_REGEX.fullmatch(page_range)
self.numbered_page_range = NUMBERED_PAGE_RANGE_REGEX.fullmatch(page_range)
......@@ -83,15 +84,15 @@ class Paper(object):
def get_column_values(self):
return [self.venue, self.year, self.identifier, self.heading,
self.title, self.authors,
self.page_range, self.length, self.electronic_edition,
self.title, self.authors, self.page_range,
self.length, self.electronic_edition, self.abstract,
self.comment]
@classmethod
def get_column_names(cls):
return ["venue", "year", "identifier", "heading", "title",
"authors", "page_range", "length",
"electronic_edition", "comment"]
"electronic_edition", "abstract", "comment"]
@classmethod
def split_page_range(cls, page_range):
......
......@@ -73,10 +73,21 @@ class Venue(object):
ee = item.xpath('nav[@class="publ"]/ul/li[@class="drop-down"]/div[@class="head"]/a/@href')
if len(ee) > 0:
abstract = ""
# select DOI link if available, first link otherwise
doi_link = [link for link in ee if "doi.org" in link]
if len(doi_link) > 0:
ee = str(doi_link[0])
## FIXME: only fetch the abstract if the abstract option is enabled
responseDOI = self.session.get(ee)
if responseDOI.ok:
logger.info("Succesfully retrieved DOI page " + ee)
treeDOI = html.fromstring(responseDOI.content)
## ACM DL
abstract_paragraphs = treeDOI.xpath('//div[@class="abstractSection abstractInFull"]/*')
for par in abstract_paragraphs:
if par.tag == "p":
abstract += (par.xpath('text()')[0])
else:
ee = str(ee[0])
......@@ -99,7 +110,8 @@ class Venue(object):
title,
authors,
pages,
ee
ee,
abstract
))
logger.info("Successfully parsed TOC of venue: " + str(self))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment