Commit 12be6705 authored by gorla's avatar gorla
Browse files

added option to retrieve bibtex and dump it to a separate file for each venue

parent bf145f7d
......@@ -29,6 +29,12 @@ def get_argument_parser():
help='delimiter for CSV files (default: \',\')',
dest='delimiter'
)
arg_parser.add_argument(
'-b', '--bibtex',
action='store_true',
help='download bibtext information (default: False)',
)
return arg_parser
......@@ -41,8 +47,12 @@ def main():
venue_list = VenueList()
venue_list.read_from_csv(args.input_file, args.delimiter)
venue_list.retrieve_papers()
if args.bibtex:
venue_list.retrieve_bibtex_entries()
venue_list.validate_page_ranges()
venue_list.write_to_csv(args.output_dir, args.delimiter)
if args.bibtex:
venue_list.write_to_bibtex(args.output_dir)
if __name__ == '__main__':
......
import logging
import urllib.request
from util.regex import REGULAR_PAGE_RANGE_REGEX, NUMBERED_PAGE_RANGE_REGEX
......@@ -8,11 +9,13 @@ logger = logging.getLogger("dblp-retriever_logger")
class Paper(object):
""" Paper metadata from DBLP. """
def __init__(self, venue, year, identifier, heading, title, authors, page_range, electronic_edition):
def __init__(self, venue, year, identifier, heading, dblp_id, title,
authors, page_range, electronic_edition):
self.venue = venue
self.year = year
self.identifier = identifier
self.heading = heading
self.dblp_id = dblp_id
self.title = title
self.authors = authors
self.page_range = page_range
......@@ -70,13 +73,24 @@ class Paper(object):
def __str__(self):
return str(self.electronic_edition)
def get_bibtex(self):
    """
    Download the BibTeX record of this paper from DBLP.

    The record URL is built from ``self.dblp_id``. Any error raised by
    ``urllib.request.urlopen`` (or by decoding the payload) propagates to
    the caller.

    :return: The response body decoded as UTF-8 text.
    """
    url = 'http://dblp.org/rec/bib/{}.bib'.format(self.dblp_id)
    logger.info("Retrieving bibtex from "+url)
    # The context manager closes the HTTP response (and its socket) even
    # when reading or decoding fails; the original left it open.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    return data.decode('utf-8')
def get_column_values(self):
return [self.venue, self.year, self.identifier, self.heading, self.title, self.authors, self.page_range,
self.length, self.electronic_edition, self.comment]
return [self.venue, self.year, self.identifier, self.heading,
self.title, self.authors,
self.page_range, self.length, self.electronic_edition,
self.comment]
@classmethod
def get_column_names(cls):
return ["venue", "year", "identifier", "heading", "title", "authors", "page_range", "length",
return ["venue", "year", "identifier", "heading", "title",
"authors", "page_range", "length",
"electronic_edition", "comment"]
@classmethod
......
import bibtexparser
import logging
import re
......@@ -20,6 +21,7 @@ class Venue(object):
self.uri = "https://dblp.org/db/" + self.identifier + ".html"
self.papers = []
self.bib_entries = []
# session for data retrieval
self.session = requests.Session()
......@@ -30,7 +32,8 @@ class Venue(object):
response = self.session.get(self.uri)
if response.ok:
logger.info("Successfully retrieved TOC of venue: " + str(self))
logger.info("Successfully retrieved TOC of venue: "
+ str(self))
tree = html.fromstring(response.content)
items = tree.xpath('//header[not(@class)]/h2 | //header[not(@class)]/h3 | //ul[@class="publ-list"]/li')
......@@ -55,6 +58,7 @@ class Venue(object):
# year = ""
continue
paper_id = item.xpath('@id')[0]
title = item.xpath('cite[@class="data"]/span[@itemprop="name"]/descendant-or-self::*/text()')
if len(title) > 0:
title = str(" ".join(str(element).strip() for element in title))
......@@ -91,6 +95,7 @@ class Venue(object):
self.year,
self.identifier,
current_heading,
paper_id,
title,
authors,
pages,
......@@ -99,10 +104,20 @@ class Venue(object):
logger.info("Successfully parsed TOC of venue: " + str(self))
else:
logger.error("An error occurred while retrieving TOC of venue: " + str(self))
logger.error("An error occurred while retrieving TOC of venue: "
+ str(self))
except ConnectionError:
logger.error("An error occurred while retrieving TOC of venue: " + str(self))
logger.error("An error occurred while retrieving TOC of venue: "
+ str(self))
def retrieve_venue_bibtex(self):
    """
    Retrieve the BibTeX entry of every paper of this venue.

    Each successfully downloaded entry is appended to ``self.bib_entries``.
    A paper whose download fails is logged and skipped, so that a single
    failing request does not abort the retrieval for all remaining papers.
    """
    logger.info("Retrieving bibtex for venue: " + str(self))
    for paper in self.papers:
        try:
            bibtex = paper.get_bibtex()
        except OSError as error:
            # urllib's URLError/HTTPError and socket-level failures are all
            # OSError subclasses; log and continue with the next paper.
            logger.error("An error occurred while retrieving bibtex of paper: "
                         + str(paper) + " (" + str(error) + ")")
            continue
        self.bib_entries.append(bibtex)
def validate_page_ranges(self):
logger.info("Sorting papers of venue: " + str(self))
......@@ -125,13 +140,16 @@ class Venue(object):
if current_paper.regular_page_range and current_paper.first_page != previous_paper.last_page + 1:
current_paper.append_comment("issue_first_page")
previous_paper.append_comment("issue_last_page")
logger.warning("First page of paper " + str(current_paper) + " does not match previous paper "
logger.warning("First page of paper " + str(current_paper)
+ " does not match previous paper "
+ str(previous_paper))
elif current_paper.numbered_page_range and current_paper.article_number != previous_paper.article_number + 1:
current_paper.append_comment("issue_article_number")
previous_paper.append_comment("issue_article_number")
logger.warning("Article number of paper " + str(current_paper) + " does not match previous paper "
logger.warning("Article number of paper "
+ str(current_paper)
+ " does not match previous paper "
+ str(previous_paper))
previous_paper = self.papers[i]
......
......@@ -24,7 +24,8 @@ class VenueList(object):
:param delimiter: Column delimiter in CSV file (typically ',').
"""
# read CSV as UTF-8 encoded file (see also http://stackoverflow.com/a/844443)
# read CSV as UTF-8 encoded file (see also
# http://stackoverflow.com/a/844443)
with codecs.open(input_file, encoding='utf8') as fp:
logger.info("Reading venues from " + input_file + "...")
......@@ -42,9 +43,9 @@ class VenueList(object):
# read CSV file
for row in reader:
if row:
self.venues.append(
Venue(row[venue_index], row[year_index], row[identifier_index])
)
self.venues.append(Venue(row[venue_index],
row[year_index],
row[identifier_index]))
else:
raise IllegalArgumentError("Wrong CSV format.")
......@@ -55,10 +56,27 @@ class VenueList(object):
for venue in self.venues:
venue.retrieve_papers()
def retrieve_bibtex_entries(self):
    """Trigger the BibTeX retrieval of every venue in this list, in order."""
    for current_venue in self.venues:
        current_venue.retrieve_venue_bibtex()
def validate_page_ranges(self):
    """Run the page-range validation of every venue in this list, in order."""
    for current_venue in self.venues:
        current_venue.validate_page_ranges()
def write_to_bibtex(self, output_dir):
    """
    Export the retrieved BibTeX entries to one .bib file per venue.

    Each file is named after the venue (``<venue.name>.bib``) and is
    written UTF-8 encoded into ``output_dir``.

    :param output_dir: Target directory for the .bib files (created if it
        does not exist yet).
    """
    # exist_ok avoids the race between a separate existence check and the
    # directory creation.
    os.makedirs(output_dir, exist_ok=True)

    # create a separate bibtex file per venue
    for venue in self.venues:
        bib_file_path = os.path.join(output_dir, venue.name + ".bib")
        with codecs.open(bib_file_path, 'w', encoding='utf8') as bfp:
            logger.info('Exporting bibtex entries to ' + bib_file_path)
            # bib_entries is a list of strings: write it in one call
            # instead of iterating every entry character by character.
            bfp.writelines(venue.bib_entries)
def write_to_csv(self, output_dir, delimiter):
"""
Export papers retrieved from venues to a CSV file.
......@@ -75,7 +93,8 @@ class VenueList(object):
file_path = os.path.join(output_dir, self.filename)
# write paper list to UTF8-encoded CSV file (see also http://stackoverflow.com/a/844443)
# write paper list to UTF8-encoded CSV file (see also
# http://stackoverflow.com/a/844443)
with codecs.open(file_path, 'w', encoding='utf8') as fp:
logger.info('Exporting papers to ' + file_path + '...')
writer = csv.writer(fp, delimiter=delimiter)
......@@ -94,10 +113,12 @@ class VenueList(object):
count = count + 1
else:
raise IllegalArgumentError(
str(len(column_names) - len(row)) + " parameter(s) is/are missing for venue "
+ venue.identifier)
str(len(column_names) - len(row)) +
" parameter(s) is/are missing for venue " +
venue.identifier)
except UnicodeEncodeError:
logger.error("Encoding error while writing data for venue: " + venue.identifier)
logger.error("Encoding error while writing venue:" +
venue.identifier)
logger.info(str(count) + ' papers have been exported.')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment