Commit b6045880 authored by gorla's avatar gorla
Browse files

retrieve abstract function added. Works only for IEEE and ACM

parent f1da2904
Pipeline #11876 failed with stages
in 0 seconds
import bibtexparser
import logging
import re
import requests
import time
import json
from lxml import html
from dblp.paper import Paper
......@@ -26,11 +27,52 @@ class Venue(object):
# session for data retrieval
self.session = requests.Session()
def getAbstract(self, DOIlink):
    """Fetch the page behind a DOI link and extract the paper abstract.

    Only ACM DL and IEEE pages are recognised. The publisher is detected
    from the ``content`` attribute of the first ``<meta content=...>`` tag
    found in the page head.

    Args:
        DOIlink: URL of the DOI landing page to download.

    Returns:
        The abstract text, or an empty string when the request fails, the
        page has no usable head metadata, the publisher is not recognised,
        or no abstract can be located in the page.
    """
    response = self.session.get(DOIlink)
    # If too many requests, wait as long as the server asks (plus a small
    # margin) and make exactly one more request.
    if response.status_code == 429:
        try:
            delay = int(response.headers.get("Retry-After", 0)) + 2
        except (TypeError, ValueError):
            # Retry-After may be missing or an HTTP-date instead of a
            # number of seconds; fall back to the safety margin alone.
            delay = 2
        # time.sleep() rejects negative values.
        time.sleep(max(delay, 0))
        response = self.session.get(DOIlink)
    if not response.ok:
        return ""

    logger.info("Succesfully retrieved DOI page " + DOIlink)
    # Decode once and reuse the text for both the HTML parser and the
    # regular expressions; undecodable bytes are replaced, not fatal.
    page = response.content.decode("utf-8", errors="replace")
    if not page.strip():
        # Nothing to parse (the HTML parser rejects empty documents).
        return ""
    tree = html.fromstring(page)
    head_metadata = tree.xpath('//head/meta[@content]')
    if not head_metadata:
        logger.warning("Could not get head metadata while retrieving the DOI link")
        return ""
    publisher = head_metadata[0].attrib.get("content")

    # ACM DL: the abstract is the text of the <p> children of the
    # abstract section.
    if "ACM" in publisher:
        sections = tree.xpath('//div[@class="abstractSection abstractInFull"]/*')
        return "".join(
            text
            for node in sections
            if node.tag == "p"
            for text in node.xpath('text()')
        )

    # IEEE: the abstract lives in a JavaScript object embedded in the page.
    if "IEEE" in publisher:
        # I wish this was proper json.. it is not really.
        # Search the decoded text rather than str(bytes), whose repr form
        # leaks escape sequences (\x.., \') into the abstract. DOTALL keeps
        # "." matching across line breaks, as it did on the one-line repr.
        metadata = re.search(
            r'xplGlobal.document.metadata=(.*)};.*</script>.*<div class="ng2-app"',
            page,
            re.DOTALL,
        )
        if metadata is None:
            # Page layout differs from what is expected; no abstract.
            return ""
        json_content = metadata.group(1) + "}"
        found = re.search(
            r'"abstract":"true",.*"abstract":"(.*?)",".*"abstract":"true"',
            json_content,
            re.DOTALL,
        )
        return found.group(1) if found is not None else ""

    # Any other publisher is not supported.
    return ""
def retrieve_papers(self):
try:
# retrieve data
response = self.session.get(self.uri)
# if too many requests, wait and make another request
if response.status_code == 429:
time.sleep(int(response.headers["Retry-After"])+2)
response = self.session.get(self.uri)
if response.ok:
logger.info("Successfully retrieved TOC of venue: "
+ str(self))
......@@ -39,7 +81,7 @@ class Venue(object):
items = tree.xpath('//header[not(@class)]/h2 | //header[not(@class)]/h3 | //ul[@class="publ-list"]/li')
current_heading = ""
##FIXME some venues do not have sessions (e.g. SP2020 or SP2013). This makes this function retrieve 0 papers.
for item in items:
if item.tag == "h2" or item.tag == "h3":
heading = item.xpath("descendant-or-self::*/text()")
......@@ -78,16 +120,8 @@ class Venue(object):
doi_link = [link for link in ee if "doi.org" in link]
if len(doi_link) > 0:
ee = str(doi_link[0])
## FIXME if abstract option is true. get abstract
responseDOI = self.session.get(ee)
if responseDOI.ok:
logger.info("Succesfully retrieved DOI page " + ee)
treeDOI = html.fromstring(responseDOI.content)
## ACM DL
abstract_paragraphs = treeDOI.xpath('//div[@class="abstractSection abstractInFull"]/*')
for par in abstract_paragraphs:
if par.tag == "p":
abstract += (par.xpath('text()')[0])
# if withAbstract:
abstract = self.getAbstract(ee)
else:
ee = str(ee[0])
......@@ -117,7 +151,7 @@ class Venue(object):
logger.info("Successfully parsed TOC of venue: " + str(self))
else:
logger.error("An error occurred while retrieving TOC of venue: "
+ str(self))
+ str(self.uri))
except ConnectionError:
logger.error("An error occurred while retrieving TOC of venue: "
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment