Commit 032fd05e authored by Sebastian Baltes's avatar Sebastian Baltes
Browse files

Initial commit

parent ffe92f5c
import argparse
import logging
# get global logger
from dblp.venue import Venue
from dblp.venue_list import VenueList
logger = logging.getLogger('dblp-retriever_logger')
def get_argument_parser():
    """
    Build the command line parser of the retriever.

    The parser knows three options: the input CSV file (required), the
    output directory (required), and the CSV column delimiter (optional,
    defaults to a comma).
    :return: The configured argparse.ArgumentParser instance.
    """
    # (flags, keyword settings) for every supported command line option
    option_table = (
        (
            ('-i', '--input-file'),
            dict(required=True, help='CSV file with venue identifiers.', dest='input_file'),
        ),
        (
            ('-o', '--output-dir'),
            dict(required=True, help='Path to output directory', dest='output_dir'),
        ),
        (
            ('-d', '--delimiter'),
            dict(required=False, default=',', help='delimiter for CSV files (default: \',\')', dest='delimiter'),
        ),
    )

    cli_parser = argparse.ArgumentParser(description='Retrieve paper metadata from DBLP,')
    for flags, settings in option_table:
        cli_parser.add_argument(*flags, **settings)
    return cli_parser
def main():
    """
    Command line entry point.

    Parses the command line, imports the venues listed in the input CSV file,
    retrieves the papers of each venue, and exports them to a CSV file in the
    output directory. The same delimiter is used for reading and writing.
    """
    cli_args = get_argument_parser().parse_args()

    venues = VenueList()
    # import venue identifiers
    venues.read_from_csv(cli_args.input_file, cli_args.delimiter)
    # fetch the papers of every imported venue
    venues.retrieve_papers()
    # export the retrieved papers
    venues.write_to_csv(cli_args.output_dir, cli_args.delimiter)


# only run when executed as a script, not when imported
if __name__ == '__main__':
    main()
import util.log
# Path of the log file handed to configure_logger; it is a relative path,
# so the file is created relative to the current working directory.
LOG_FILE = 'dblp-retriever.log'
# initialize named global logger
# (other modules obtain the same logger via logging.getLogger with this name)
logger = util.log.configure_logger('dblp-retriever_logger', LOG_FILE)
class Paper(object):
    """
    DBLP paper.

    Holds the metadata of one paper and derives its length (number of pages)
    from the ``pages`` string.
    """

    def __init__(self, venue, year, identifier, heading, title, authors, pages, electronic_edition):
        """
        :param venue: Name of the venue the paper belongs to.
        :param year: Year of the venue.
        :param identifier: Identifier of the venue.
        :param heading: Heading under which the paper was listed.
        :param title: Title of the paper.
        :param authors: Author names as a single string.
        :param pages: Page information as a string, e.g. "1-10", "5", or "12:1-12:20".
        :param electronic_edition: Link to the electronic edition.
        """
        self.venue = venue
        self.year = year
        self.identifier = identifier
        self.heading = heading
        self.title = title
        self.authors = authors
        self.pages = pages
        # determine paper length
        self.length = self._compute_length(pages)
        self.electronic_edition = electronic_edition

    @staticmethod
    def _compute_length(pages):
        """
        Derive the number of pages from a page string.

        :param pages: Page information as a string.
        :return: 1 if the string contains no "-" (this includes the empty string),
            end - begin + 1 for a range "begin-end" (for numbered articles such as
            "12:1-12:20" the part after the colon is used), and 0 if the string
            cannot be interpreted as a page range.
        """
        page_range = pages.split("-")
        if len(page_range) == 1:
            return 1
        if len(page_range) != 2:
            return 0

        begin_page = page_range[0].split(":")
        end_page = page_range[1].split(":")
        try:
            if len(begin_page) == 1:
                return int(end_page[0]) - int(begin_page[0]) + 1
            if len(begin_page) == 2:
                # numbered articles, see, e.g., TOSEM
                return int(end_page[1]) - int(begin_page[1]) + 1
        except (ValueError, IndexError):
            # Non-numeric pages (e.g. roman numerals) or an end page without the
            # article-number prefix: report an unknown length instead of letting
            # one malformed entry abort the retrieval.
            return 0
        return 0

    @classmethod
    def get_column_names(cls):
        """
        :return: Column names for the CSV export, in the same order as the values
            returned by get_column_values.
        """
        return ["venue", "year", "identifier", "heading", "title", "authors", "pages", "length", "electronic_edition"]

    def get_column_values(self):
        """
        :return: Values of this paper for the CSV export, ordered like get_column_names.
        """
        return [self.venue, self.year, self.identifier, self.heading, self.title, self.authors, self.pages,
                self.length, self.electronic_edition]
\ No newline at end of file
import logging
# get the named global logger (not the root logger)
import requests
from lxml import html
from dblp.paper import Paper
logger = logging.getLogger("dblp-retriever_logger")
class Venue(object):
    """
    DBLP venue.

    Represents one table of contents (TOC) page on dblp.org and the papers
    parsed from it.
    """

    def __init__(self, name, year, identifier):
        """
        :param name: Name of the venue.
        :param year: Year of the venue.
        :param identifier: DBLP identifier of the venue; used to build the TOC URI.
        """
        self.name = str(name)
        self.year = str(year)
        self.identifier = str(identifier)
        self.uri = "https://dblp.org/db/" + self.identifier + ".html"
        self.papers = []
        # session for data retrieval
        self.session = requests.Session()

    @staticmethod
    def _first_or_empty(values):
        """ Return the first XPath result as a plain string, or "" if there is none. """
        return str(values[0]) if len(values) > 0 else ""

    def retrieve_papers(self):
        """
        Download the TOC page of this venue and append one Paper per listed
        publication to self.papers.

        Retrieval failures (request errors or a non-OK HTTP status) are logged
        and leave self.papers unchanged; they do not raise.
        """
        try:
            # retrieve data
            response = self.session.get(self.uri)
        except requests.exceptions.RequestException:
            # Requests signals network problems with its own exception hierarchy,
            # which is not derived from the builtin ConnectionError.
            logger.error("An error occurred while retrieving TOC of venue: " + self.identifier)
            return

        if not response.ok:
            logger.error("An error occurred while retrieving TOC of venue: " + self.identifier)
            return

        logger.info("Successfully retrieved TOC of venue: " + self.identifier)

        tree = html.fromstring(response.content)
        items = tree.xpath('//header[not(@class)]/h2 | //header[not(@class)]/h3 | //ul[@class="publ-list"]/li')

        current_heading = ""
        for item in items:
            if item.tag == "h2" or item.tag == "h3":
                current_heading = item.text
                continue
            if item.tag != "li":
                continue
            if current_heading == "":
                # skip entries listed before the first heading
                continue

            title = self._first_or_empty(
                item.xpath('div[@class="data"]/span[@itemprop="name"]/text()'))
            pages = self._first_or_empty(
                item.xpath('div[@class="data"]/span[@itemprop="pagination"]/text()'))
            ee = self._first_or_empty(
                item.xpath('nav[@class="publ"]/ul/li[@class="drop-down"]/div[@class="head"]/a/@href'))
            # multiple authors are separated by "; "
            authors = "; ".join(
                item.xpath('div[@class="data"]/span[@itemprop="author"]/a/span[@itemprop="name"]/text()'))

            self.papers.append(Paper(
                self.name,
                self.year,
                self.identifier,
                current_heading,
                title,
                authors,
                pages,
                ee
            ))

        logger.info("Successfully parsed TOC of venue: " + self.identifier)

    def get_rows(self):
        """
        :return: One list of column values per retrieved paper (see Paper.get_column_values).
        """
        return [paper.get_column_values() for paper in self.papers]
import codecs
import csv
import logging
# get the named global logger (not the root logger)
import os
from dblp import paper
from dblp.paper import Paper
from dblp.venue import Venue
from util.exceptions import IllegalArgumentError
logger = logging.getLogger("dblp-retriever_logger")
class VenueList(object):
    """
    List of DBLP venues.

    Venues are imported from a CSV file; the papers retrieved for them are
    exported to a CSV file with the same file name in an output directory.
    """

    def __init__(self):
        # base name of the imported CSV file; reused as name of the exported file
        self.filename = ""
        self.venues = []

    def read_from_csv(self, input_file, delimiter):
        """
        Read venues from a CSV file (header required).
        The header must contain the columns "venue", "year", and "identifier".
        :param input_file: Path to the CSV file.
        :param delimiter: Column delimiter in CSV file (typically ',').
        :raises IllegalArgumentError: If the header is missing or lacks a required
            column, or if a row is empty or too short to contain all required columns.
        """

        # read CSV as UTF-8 encoded file; newline='' lets the csv module handle line endings
        with open(input_file, encoding='utf8', newline='') as fp:
            logger.info("Reading venues from " + input_file + "...")

            reader = csv.reader(fp, delimiter=delimiter)

            # read header
            header = next(reader, None)
            if not header:
                raise IllegalArgumentError("Missing header in CSV file.")

            try:
                venue_index = header.index("venue")
                year_index = header.index("year")
                identifier_index = header.index("identifier")
            except ValueError as error:
                raise IllegalArgumentError(
                    "Missing column in CSV header (required: venue, year, identifier).") from error

            # a row must be long enough to contain every required column
            required_length = max(venue_index, year_index, identifier_index) + 1

            # read CSV file
            for row in reader:
                if len(row) < required_length:
                    raise IllegalArgumentError("Wrong CSV format.")
                self.venues.append(Venue(row[venue_index], row[year_index], row[identifier_index]))

        self.filename = os.path.basename(input_file)
        logger.info(str(len(self.venues)) + " venues have been imported.")

    def retrieve_papers(self):
        """ Retrieve the papers of all venues in this list. """
        for venue in self.venues:
            venue.retrieve_papers()

    def write_to_csv(self, output_dir, delimiter):
        """
        Export papers retrieved from venues to a CSV file.
        The file is named like the imported CSV file; nothing is written if
        the list contains no venues.
        :param output_dir: Target directory for generated CSV file.
        :param delimiter: Column delimiter in CSV file (typically ',').
        :raises IllegalArgumentError: If a row does not have one value per column.
        """

        if not self.venues:
            logger.info("Nothing to export.")
            return

        # create the target directory if needed (no separate existence check, so no race)
        os.makedirs(output_dir, exist_ok=True)

        file_path = os.path.join(output_dir, self.filename)

        # write paper list to UTF8-encoded CSV file
        with open(file_path, 'w', encoding='utf8', newline='') as fp:
            logger.info('Exporting papers to ' + file_path + '...')
            writer = csv.writer(fp, delimiter=delimiter)

            column_names = Paper.get_column_names()

            # write header of CSV file
            writer.writerow(column_names)

            count = 0
            for venue in self.venues:
                try:
                    for row in venue.get_rows():
                        if len(row) == len(column_names):
                            writer.writerow(row)
                            count = count + 1
                        else:
                            raise IllegalArgumentError(
                                str(len(column_names) - len(row)) + " parameter(s) is/are missing for venue "
                                + venue.identifier)
                except UnicodeEncodeError:
                    # skip the remaining rows of this venue, continue with the next one
                    logger.error("Encoding error while writing data for venue: " + venue.identifier)

            logger.info(str(count) + ' papers have been exported.')
""" Custom errors and exceptions. """
class IllegalArgumentError(Exception):
    """
    Signals that a function was called with an unacceptable argument value.
    (Comparable to ValueError for built-in functions.)
    """
class IllegalStateError(Exception):
    """
    Signals that the application is in a state in which the requested
    operation is not allowed.
    (e.g., values not initialized, functions not called in intended order, etc.)
    """
class IllegalConfigurationError(Exception):
    """
    Signals that a provided or parsed configuration is not acceptable.
    (e.g., fields missing)
    """
""" Global logger. """
import logging
def configure_logger(name, log_file):
    """
    Configure a named global logger.
    (see also [1])

    The logger writes messages of level INFO and above to the console and
    messages of level DEBUG and above to the log file. Calling this function
    again for the same logger does not attach a second console handler or a
    second file handler for the same log file, so messages are not duplicated.
    :param name: Name of global logger.
    :param log_file: Path to log file for FileHandler.
    :return: The configured logger.
    [1]: http://stackoverflow.com/a/7622029
    """

    logger = logging.getLogger(name)  # name is None => returns root logger
    log_formatter = logging.Formatter(fmt='%(asctime)s %(name)s %(levelname)s: %(message)s')
    logger.setLevel(logging.DEBUG)

    # write log messages to console
    # (exact type check on purpose: FileHandler is a subclass of StreamHandler)
    has_console_handler = any(type(handler) is logging.StreamHandler for handler in logger.handlers)
    if not has_console_handler:
        console_handler = logging.StreamHandler()
        console_handler.setFormatter(log_formatter)
        console_handler.setLevel(logging.INFO)
        logger.addHandler(console_handler)

    # write log messages to log file
    file_handler = logging.FileHandler(log_file)
    has_file_handler = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == file_handler.baseFilename
        for handler in logger.handlers
    )
    if has_file_handler:
        # this log file is already attached; release the redundant handler
        file_handler.close()
    else:
        file_handler.setFormatter(log_formatter)
        file_handler.setLevel(logging.DEBUG)
        logger.addHandler(file_handler)

    return logger
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment