Commit 5db7e24e authored by gorla

added abstract retrieval and changed the output parameter from a directory to a file

parent b6045880
@@ -17,10 +17,10 @@ def get_argument_parser():
         dest='input_file'
     )
     arg_parser.add_argument(
-        '-o', '--output-dir',
+        '-o', '--output',
         required=True,
-        help='Path to output directory',
-        dest='output_dir'
+        help='Output file',
+        dest='output_file'
     )
     arg_parser.add_argument(
         '-d', '--delimiter',
@@ -34,7 +34,11 @@ def get_argument_parser():
         action='store_true',
         help='download bibtex information (default: False)',
     )
+    arg_parser.add_argument(
+        '-a', '--abstract',
+        action='store_true',
+        help='retrieve abstract from DOI link (default: False)',
+    )
     return arg_parser
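
For context, a quick sketch of how the revised options parse. This is a hedged example, not part of the commit: it assumes get_argument_parser() is importable, and the file names are illustrative.

# Sketch: exercising the revised parser (get_argument_parser() as defined
# in the hunk above; paths are illustrative, not from the commit).
args = get_argument_parser().parse_args(
    ['-i', 'venues.csv', '-o', 'out/papers.csv', '-a']
)
assert args.output_file == 'out/papers.csv'  # dest renamed from output_dir
assert args.abstract is True                 # new flag added by this commit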
@@ -50,9 +54,9 @@ def main():
     if args.bibtex:
         venue_list.retrieve_bibtex_entries()
     venue_list.validate_page_ranges()
-    venue_list.write_to_csv(args.output_dir, args.delimiter)
+    venue_list.write_to_csv(args.output_file, args.delimiter)
     if args.bibtex:
-        venue_list.write_to_bibtex(args.output_dir)
+        venue_list.write_to_bibtex(args.output_file)

 if __name__ == '__main__':
......
 import bibtexparser
 import logging
 import re
 import requests
 import time
+from urllib.request import urlopen
+import json
+import pdb
 from lxml import html
 from dblp.paper import Paper
+from bs4 import BeautifulSoup

 logger = logging.getLogger("dblp-retriever_logger")
@@ -19,50 +21,108 @@ class Venue(object):
         self.year = str(year)
         self.identifier = str(identifier)
-        self.uri = "https://dblp.org/db/" + self.identifier + ".html"
+        self.uri = "https://dblp.uni-trier.de/db/" + self.identifier + ".html"
         self.papers = []
         self.bib_entries = []

         # session for data retrieval
         self.session = requests.Session()
-    def getAbstract(self, DOIlink):
-        abstract = ""
-        responseDOI = self.session.get(DOIlink)
-        # if too many requests, wait and make another request
-        if responseDOI.status_code == 429:
-            time.sleep(int(responseDOI.headers["Retry-After"])+2)
-            responseDOI = self.session.get(DOIlink)
-        if responseDOI.ok:
-            logger.info("Succesfully retrieved DOI page " + DOIlink)
-            treeDOI = html.fromstring(responseDOI.content.decode('utf-8'))
-            head_metadata = treeDOI.xpath('//head/meta[@content]')
-            if len(head_metadata) == 0:
-                logger.warn("Could not get head metadata while retrieving the DOI link")
-                return abstract
-            # ACM DL
-            if "ACM" in head_metadata[0].attrib.get("content"):
-                abstract_paragraphs = treeDOI.xpath('//div[@class="abstractSection abstractInFull"]/*')
-                for par in abstract_paragraphs:
-                    if par.tag == "p":
-                        for t in par.xpath('text()'):
-                            abstract += t
-                return abstract
-            # IEEE
-            if "IEEE" in head_metadata[0].attrib.get("content"):
-                # I wish this was proper json.. it is not really
-                json_content = re.search('xplGlobal.document.metadata=(.*)};.*</script>.*<div class="ng2-app"', str(responseDOI.content))
-                json_content = json_content.group(1)+"}".strip()
-                abstract = re.search('"abstract":"true",.*"abstract":"(.*?)",".*"abstract":"true"', json_content)
-                if abstract is None:
-                    return ""
-                else:
-                    return abstract.group(1)
-            return abstract
-        else:
-            return ""
-        return ""
+    def getAbstract(self, DOIlink, title=''):
+        """Retrieve the abstract for a DOI link; may return an empty string."""
+        time.sleep(2)
+        abstract = ""
+        # usenix papers need to be searched by title
+        if title != '':
+            titleToSearch = title.lower()
+            titleToSearch = re.sub("[^0-9a-zA-Z ]+", " ", titleToSearch)
+            titleToSearch = ' '.join(titleToSearch.split())
+            titleToSearch = titleToSearch.strip()
+            titleToSearch = titleToSearch.replace(" ", "+")
+            response = urlopen('https://api.semanticscholar.org/graph/v1/paper/search?query=' + titleToSearch + '&fields=title,abstract')
+            data_json = json.loads(response.read())
+            abstract = data_json["data"][0]["abstract"]
+            if abstract is None:
+                abstract = ""
+                logger.warning("No abstract for " + DOIlink)
+            else:
+                logger.info("Successfully retrieved DOI page " + DOIlink)
+            return abstract
+
+        doi = DOIlink.replace('https://doi.org/', '')
+        res = requests.get('https://api.semanticscholar.org/' + doi).text
+        parsed_html = BeautifulSoup(res, 'html.parser')
+        try:
+            abstract = parsed_html.findAll('meta', attrs={'name': 'description'}).pop().attrs['content']
+            logger.info("Successfully retrieved DOI page " + DOIlink)
+        except IndexError:
+            logger.warning("No abstract for " + DOIlink)
+            abstract = ''
+        return abstract
+        # responseDOI = self.session.get(DOIlink, headers=headers)
+        # # if too many requests, wait and make another request
+        # if responseDOI.status_code == 429:
+        #     logger.warn("Response 429 while retrieving abstract of " + DOIlink)
+        #     time.sleep(int(responseDOI.headers["Retry-After"])+2)
+        #     responseDOI = self.session.get(DOIlink)
+        # elif responseDOI.ok:
+        #     logger.info("Succesfully retrieved DOI page " + DOIlink)
+        #     treeDOI = html.fromstring(responseDOI.content.decode('utf-8'))
+        #     # Elsevier
+        #     if 'content="Elsevier"' in responseDOI.content.decode('utf-8'):
+        #         abstract_paragraphs = treeDOI.xpath('//section[@class="abstract"]/div/*')
+        #         pdb.set_trace()
+        #         for par in abstract_paragraphs:
+        #             if par.tag == "p":
+        #                 for t in par.xpath('text()'):
+        #                     abstract += t
+        #     # Springer
+        #     head_metadata = treeDOI.xpath('//head/meta[@name][2]')
+        #     if "Springer" in responseDOI.content.decode('utf-8'):
+        #         abstract_paragraphs = treeDOI.xpath('//section[@class="Abstract"]/*')
+        #         # pdb.set_trace()
+        #         # springer journals are different...
+        #         if len(abstract_paragraphs) == 0:
+        #             abstract_paragraphs = treeDOI.xpath('//section[@data-title="Abstract"]/*')
+        #         for par in abstract_paragraphs:
+        #             if par.tag == "p":
+        #                 for t in par.xpath('text()'):
+        #                     abstract += t
+        #
+        #         return abstract
+        #
+        #     head_metadata = treeDOI.xpath('//head/meta[@content]')
+        #     if len(head_metadata) == 0:
+        #         logger.warn("Could not get head metadata while retrieving the DOI link")
+        #         return abstract
+        #     # ACM DL
+        #     if "ACM" in head_metadata[0].attrib.get("content"):
+        #         abstract_paragraphs = treeDOI.xpath('//div[@class="abstractSection abstractInFull"]/*')
+        #         for par in abstract_paragraphs:
+        #             if par.tag == "p":
+        #                 for t in par.xpath('text()'):
+        #                     abstract += t
+        #         return abstract
+        #     # IEEE
+        #     if "IEEE" in head_metadata[0].attrib.get("content"):
+        #         # I wish this was proper json.. it is not really
+        #         json_content = re.search('xplGlobal.document.metadata=(.*)};.*</script>.*<div class="ng2-app"', str(responseDOI.content))
+        #         if json_content is None:
+        #             return ""
+        #         json_content = json_content.group(1)+"}".strip()
+        #         abstract = re.search('"abstract":"true",.*"abstract":"(.*?)",".*"abstract":"true"', json_content)
+        #         if abstract is None:
+        #             return ""
+        #         else:
+        #             return abstract.group(1)
+        #         return abstract
+        #     else:
+        #         return abstract
+        # else:
+        #     logger.warn(str(DOIlink) + "not retrieved. Error number:" + str(responseDOI.status_code))
+        #     return abstract

     def retrieve_papers(self):
         try:
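
The new getAbstract() above queries Semantic Scholar twice: a title search for USENIX papers, and an HTML scrape of the semanticscholar.org paper page for DOI links. As a side note, the public Semantic Scholar Graph API also exposes a direct DOI lookup that returns JSON; a minimal sketch of that alternative (not the commit's code; the timeout and error handling are assumptions):

import requests

def fetch_abstract_by_doi(doi):
    # Graph API paper lookup by DOI; returns JSON directly, so no
    # BeautifulSoup scrape is needed (sketch only, alternative to the
    # scraping code above).
    url = 'https://api.semanticscholar.org/graph/v1/paper/DOI:' + doi
    response = requests.get(url, params={'fields': 'title,abstract'}, timeout=10)
    response.raise_for_status()
    return response.json().get('abstract') or ''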
@@ -81,7 +141,6 @@ class Venue(object):
             items = tree.xpath('//header[not(@class)]/h2 | //header[not(@class)]/h3 | //ul[@class="publ-list"]/li')

             current_heading = ""
-            ##FIXME some venues do not have sessions (e.g. SP2020 or SP2013). This makes this function retrieve 0 papers.
             for item in items:
                 if item.tag == "h2" or item.tag == "h3":
                     heading = item.xpath("descendant-or-self::*/text()")
@@ -91,23 +150,15 @@ class Venue(object):
                     else:
                         current_heading = ""
                 elif item.tag == "li":
-                    if current_heading == "":
-                        # the following only works for conferences, not for journals
-                        # year = item.xpath('div[@class="data"]/span[@itemprop="datePublished"]/text()')
-                        # if len(year) > 0:
-                        #     year = str(year[0])
-                        # else:
-                        #     year = ""
-                        continue
                     paper_id = item.xpath('@id')[0]
-                    title = item.xpath('cite[@class="data"]/span[@itemprop="name"]/descendant-or-self::*/text()')
+                    title = item.xpath('cite[@class="data tts-content"]/span[@itemprop="name"]/descendant-or-self::*/text()')
                     if len(title) > 0:
                         title = str(" ".join(str(element).strip() for element in title))
                     else:
                         title = ""
-                    pages = item.xpath('cite[@class="data"]/span[@itemprop="pagination"]/text()')
+                    pages = item.xpath('cite[@class="data tts-content"]/span[@itemprop="pagination"]/text()')
                     if len(pages) > 0:
                         pages = str(pages[0])
                     else:
@@ -117,24 +168,41 @@ class Venue(object):
                     if len(ee) > 0:
                         abstract = ""
                         # select DOI link if available, first link otherwise
                         doi_link = [link for link in ee if "doi.org" in link]
                         if len(doi_link) > 0:
                             ee = str(doi_link[0])
-                            # if withAbstract:
-                            abstract = self.getAbstract(ee)
                         else:
                             ee = str(ee[0])
+                        attempts = 0
+                        short_title = title
+                        while abstract == "" and attempts < 5:
+                            try:
+                                abstract = self.getAbstract(ee, short_title)
+                            except IndexError:
+                                # FIXME: some titles give problems so we try to remove words and search again
+                                # for the shorter title. Workaround that works sometimes, but a better fix is
+                                # required.
+                                short_title = short_title.rsplit(' ', 1)[0]
+                                logger.error("attempt " + str(attempts) + ": Weird index error for " + short_title)
+                            finally:
+                                attempts = attempts + 1
                     else:
                         ee = ""
-                    authors = item.xpath('cite[@class="data"]/span[@itemprop="author"]/a/span[@itemprop="name"]'
+                    authors = item.xpath('cite[@class="data tts-content"]/span[@itemprop="author"]/a/span[@itemprop="name"]'
                                          '/text()')
+                    # if there are no authors it means that it is
+                    # the first entry of the proceedings, which
+                    # should be skipped.
+                    if len(authors) == 0:
+                        continue
                     if len(authors) == 1:
                         authors = str(authors[0])
                     else:
                         authors = "; ".join(authors)
                     self.papers.append(Paper(
                         self.name,
                         self.year,
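
The retry loop above works around data_json["data"][0] in getAbstract() raising IndexError when the title search returns no hits; each attempt drops the last word of the title. A guarded variant that avoids the exception entirely (a sketch, not the commit's code; URL-quoting the query is an added assumption):

import json
from urllib.parse import quote_plus
from urllib.request import urlopen

def search_abstract(title):
    # Same Semantic Scholar search endpoint as getAbstract(), but an
    # empty result list yields "" instead of raising IndexError.
    url = ('https://api.semanticscholar.org/graph/v1/paper/search?query='
           + quote_plus(title) + '&fields=title,abstract')
    data = json.loads(urlopen(url).read())
    results = data.get('data') or []
    return (results[0].get('abstract') or '') if results else ''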
@@ -162,8 +230,6 @@ class Venue(object):
         for paper in self.papers:
             bibtex = paper.get_bibtex()
             self.bib_entries.append(bibtex)
-            # bib_database = bibtexparser.loads(self.papers[0].get_bibtex())
-            # print(bib_database.entries[1].get('title'))

     def validate_page_ranges(self):
         logger.info("Sorting papers of venue: " + str(self))
......
@@ -64,7 +64,8 @@ class VenueList(object):
         for venue in self.venues:
             venue.validate_page_ranges()

-    def write_to_bibtex(self, output_dir):
+    def write_to_bibtex(self, output_file):
+        output_dir = os.path.dirname(output_file)
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
@@ -78,7 +79,7 @@ class VenueList(object):
             for bib_entry in venue.bib_entries:
                 bfp.writelines(bib_entry)

-    def write_to_csv(self, output_dir, delimiter):
+    def write_to_csv(self, output_file, delimiter):
         """
         Export papers retrieved from venues to a CSV file.
         :param output_dir: Target directory for generated CSV file.
@@ -89,15 +90,14 @@ class VenueList(object):
             logger.info("Nothing to export.")
             return

+        output_dir = os.path.dirname(output_file)
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
-        file_path = os.path.join(output_dir, self.filename)

         # write paper list to UTF8-encoded CSV file (see also
         # http://stackoverflow.com/a/844443)
-        with codecs.open(file_path, 'w', encoding='utf8') as fp:
-            logger.info('Exporting papers to ' + file_path + '...')
+        with codecs.open(output_file, 'w', encoding='utf8') as fp:
+            logger.info('Exporting papers to ' + output_file + '...')
             writer = csv.writer(fp, delimiter=delimiter)
             column_names = Paper.get_column_names()
......
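
Both writers now take a file path instead of a directory and derive the directory themselves. The pattern in isolation (an illustrative sketch; the guard for an empty dirname is an addition, since os.makedirs('') raises when the output file has no directory component):

import os

output_file = 'results/papers.csv'         # illustrative path
output_dir = os.path.dirname(output_file)  # '' if no directory component
if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)                # create parent directory on demand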