Commit f4029536 authored by Sebastian Baltes
Browse files

Add validation of page ranges

parent 7e0d59fb
# dblp-retriever
Retrieve paper metadata from conference proceedings and journals indexed in DBLP.
Currently, retrieval of the following properties is supported:
* paper title
* authors
......@@ -10,6 +10,7 @@ Currently, retrieval of the following properties is suppoerted:
* paper length
* link to electronic edition of paper
The tool validates the page ranges and, if it detects possible inconsistencies, adds a log message to the `comment` column.
Tested with ICSE, FSE, TSE, and TOSEM 2014-2018.
# Setup
......
import argparse
import logging
# get global logger
from dblp.venue import Venue
from dblp.venue_list import VenueList
logger = logging.getLogger('dblp-retriever_logger')
......@@ -39,14 +37,11 @@ def main():
parser = get_argument_parser()
args = parser.parse_args()
# read venue identifiers from CSV
# process venues
venue_list = VenueList()
venue_list.read_from_csv(args.input_file, args.delimiter)
# retrieve papers
venue_list.retrieve_papers()
# write entities to CSV file
venue_list.validate_page_ranges()
venue_list.write_to_csv(args.output_dir, args.delimiter)
......
import logging
from util.regex import REGULAR_PAGE_RANGE_REGEX, NUMBERED_PAGE_RANGE_REGEX
logger = logging.getLogger("dblp-retriever_logger")
class Paper(object):
    """
    Paper metadata from DBLP.

    The page range string is parsed on construction into article_number, first_page,
    last_page and length. Values that cannot be derived keep the sentinel -1.
    """

    def __init__(self, venue, year, identifier, heading, title, authors, page_range, electronic_edition):
        """
        Store the metadata and derive page information from page_range.

        Handled page range forms are "" (empty), "5", "60-71", "27:1" and "18:1-18:33".
        An empty page range yields length 0 and the comment "empty_page_range".
        Any other non-empty value that matches neither regular expression leaves
        article_number, first_page, last_page and length at -1.
        """
        self.venue = venue
        self.year = year
        self.identifier = identifier
        self.heading = heading
        self.title = title
        self.authors = authors

        self.page_range = page_range
        self.article_number = -1
        self.first_page = -1
        self.last_page = -1
        self.length = -1

        self.electronic_edition = electronic_edition
        self.comment = ""

        # match objects (or None); also read by the venue-level page range validation
        self.regular_page_range = REGULAR_PAGE_RANGE_REGEX.fullmatch(page_range)
        self.numbered_page_range = NUMBERED_PAGE_RANGE_REGEX.fullmatch(page_range)

        # determine paper length
        if page_range == "":
            # empty page range: page numbers stay at -1, length is reported as 0
            self.length = 0
            self.append_comment("empty_page_range")
            logger.warning("Empty page range for paper %s", self)
        elif self.regular_page_range:
            # only one page, e.g. "5", or regular page range, e.g. "60-71";
            # for a single page the first and the last fragment are the same
            pages = Paper.split_page_range(self.page_range)
            self.first_page = int(pages[0])
            self.last_page = int(pages[-1])
            self.length = self.last_page - self.first_page + 1
        elif self.numbered_page_range:
            # numbered article with one page, e.g. "27:1", or with a page range,
            # e.g. "18:1-18:33" (see, e.g., TOSEM); fragments are
            # [article, page] or [article, first page, article, last page]
            fragments = Paper.split_numbered_page_range(self.page_range)
            self.article_number = int(fragments[0])
            self.first_page = int(fragments[1])
            self.last_page = int(fragments[-1])
            self.length = self.last_page - self.first_page + 1

    def append_comment(self, comment):
        """ Append a comment; multiple comments are separated by ";". """
        if self.comment == "":
            self.comment = comment
        else:
            self.comment = self.comment + ";" + comment

    def __str__(self):
        """ Return the link to the electronic edition as a string. """
        return str(self.electronic_edition)

    def get_column_values(self):
        """ Return the values of this paper in the order of get_column_names(). """
        return [self.venue, self.year, self.identifier, self.heading, self.title, self.authors, self.page_range,
                self.length, self.electronic_edition, self.comment]

    @classmethod
    def get_column_names(cls):
        """ Return the column names matching get_column_values(). """
        return ["venue", "year", "identifier", "heading", "title", "authors", "page_range", "length",
                "electronic_edition", "comment"]

    @classmethod
    def split_page_range(cls, page_range):
        """ Split a page range at "-", e.g. "60-71" -> ["60", "71"]. """
        return str(page_range).split("-")

    @classmethod
    def split_numbered_page_range(cls, numbered_page_range):
        """ Split a numbered page range at "-" and ":", e.g. "18:1-18:33" -> ["18", "1", "18", "33"]. """
        return [
            fragment
            for page in cls.split_page_range(numbered_page_range)
            for fragment in str(page).split(":")
        ]
import logging
# get root logger
import requests
from lxml import html
from lxml import html
from dblp.paper import Paper
logger = logging.getLogger("dblp-retriever_logger")
class Venue(object):
""" DBLP venue. """
""" A venue on DBLP. """
def __init__(self, name, year, identifier):
self.name = str(name)
self.year = str(year)
self.identifier = str(identifier)
self.uri = "https://dblp.org/db/" + self.identifier + ".html"
self.papers = []
......@@ -29,7 +28,7 @@ class Venue(object):
response = self.session.get(self.uri)
if response.ok:
logger.info("Successfully retrieved TOC of venue: " + self.identifier)
logger.info("Successfully retrieved TOC of venue: " + str(self))
tree = html.fromstring(response.content)
items = tree.xpath('//header[not(@class)]/h2 | //header[not(@class)]/h3 | //ul[@class="publ-list"]/li')
......@@ -85,15 +84,50 @@ class Venue(object):
ee
))
logger.info("Successfully parsed TOC of venue: " + self.identifier)
logger.info("Successfully parsed TOC of venue: " + str(self))
else:
logger.error("An error occurred while retrieving TOC of venue: " + self.identifier)
logger.error("An error occurred while retrieving TOC of venue: " + str(self))
except ConnectionError:
logger.error("An error occurred while retrieving TOC of venue: " + self.identifier)
logger.error("An error occurred while retrieving TOC of venue: " + str(self))
def validate_page_ranges(self):
    """
    Sort the papers of this venue by first page and compare each paper with its predecessor.

    A paper with a regular page range whose first page does not directly follow the
    last page of the preceding paper gets the comment "issue_first_page" (the
    preceding paper gets "issue_last_page"). A paper with a numbered page range whose
    article number does not directly follow the preceding article number gets the
    comment "issue_article_number" (as does the preceding paper). Pairs in which one
    of the papers has an empty page range are skipped.
    """
    venue_label = str(self)

    logger.info("Sorting papers of venue: " + venue_label)
    self.papers.sort(key=lambda paper: paper.first_page)

    logger.info("Validating page ranges of venue: " + venue_label)

    # walk over neighbouring papers; yields nothing for fewer than two papers
    for earlier, later in zip(self.papers, self.papers[1:]):
        if "" in (later.page_range, earlier.page_range):
            continue

        if later.regular_page_range and later.first_page != earlier.last_page + 1:
            later.append_comment("issue_first_page")
            earlier.append_comment("issue_last_page")
            message = "First page of paper " + str(later) + " does not match previous paper " + str(earlier)
            logger.warning(message)
        elif later.numbered_page_range and later.article_number != earlier.article_number + 1:
            later.append_comment("issue_article_number")
            earlier.append_comment("issue_article_number")
            message = "Article number of paper " + str(later) + " does not match previous paper " + str(earlier)
            logger.warning(message)
def get_rows(self):
    """ Return one list of column values per paper, in the current order of the papers. """
    return [entry.get_column_values() for entry in self.papers]
def __str__(self):
    """ Return the identifier of this venue as a string. """
    identifier = self.identifier
    return str(identifier)
import codecs
import csv
import logging
# get root logger
import os
from dblp import paper
from dblp.paper import Paper
from dblp.venue import Venue
from util.exceptions import IllegalArgumentError
......@@ -45,7 +42,9 @@ class VenueList(object):
# read CSV file
for row in reader:
if row:
self.venues.append(Venue(row[venue_index], row[year_index], row[identifier_index]))
self.venues.append(
Venue(row[venue_index], row[year_index], row[identifier_index])
)
else:
raise IllegalArgumentError("Wrong CSV format.")
......@@ -56,6 +55,10 @@ class VenueList(object):
for venue in self.venues:
venue.retrieve_papers()
def validate_page_ranges(self):
    """ Run the page range validation of every venue in this list, in list order. """
    for entry in self.venues:
        entry.validate_page_ranges()
def write_to_csv(self, output_dir, delimiter):
"""
Export papers retrieved from venues to a CSV file.
......
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
""" Collection of regular expressions. """
import re
# regular expressions for page ranges; dblp.paper applies them with fullmatch(),
# so each pattern has to describe the complete page range string
# single page or regular page range, e.g. "5" or "60-71"
REGULAR_PAGE_RANGE_REGEX = re.compile(r'\d+(-\d+)?')
# single page or page range prefixed with an article number, e.g. "27:1" or "18:1-18:33"
NUMBERED_PAGE_RANGE_REGEX = re.compile(r'\d+:\d+(-\d+:\d+)?')
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment