Extracting Abstracts For an AGU Session

Extracting Abstracts For an AGU Session

We can extract titles and requested presentation formats from the AGU web page for a session as follows:

import pandas as pd
import requests
from lxml import html

# Session-view endpoint on the AGU Confex site.
url = "https://agu.confex.com/agu/fm21/h/sessions/viewsession.cgi"

# Query-string parameters sent with the request. "Recordid" presumably
# selects which session is shown -- confirm against the Confex URL scheme.
params = {
    "RecordType": "Session",
    "Recordid": "125520",
    "Field0Name": "password",
    "Field0Value": "*cookie",
    "Hash": "0fb450efc41f35c181addb926543f8d8",
}

# requests applies no timeout by default; set one so an unresponsive server
# cannot hang the script indefinitely.
r = requests.get(url, params=params, timeout=30)
# Fail loudly on an HTTP error rather than parsing an error page into an
# empty or misleading table.
r.raise_for_status()

root = html.fromstring(r.text)

# Each query finds a <b> label (nested div/div/b) whose text contains the
# given phrase, then collects the text nodes that are direct children of the
# label's parent element, i.e. the text sitting next to the bold label.
xpath = '//div/div/b[contains(text(),"Requested Presentation Format")]/parent::*/text()'
formats = [x.strip() for x in root.xpath(xpath)]

# The text next to the "Abstract ID#" label is what this script treats as
# the title (presumably the ID and title share that element -- verify
# against the page markup).
xpath = '//div/div/b[contains(text(),"Abstract ID#")]/parent::*/text()'
titles = [x.strip() for x in root.xpath(xpath)]

# The two lists come from independent queries, so nothing guarantees they
# line up. Report a mismatch explicitly instead of relying on the generic
# length error the DataFrame constructor would raise.
if len(titles) != len(formats):
    raise ValueError(
        f"Extracted {len(titles)} titles but {len(formats)} presentation "
        "formats; the page layout may differ from what the XPath "
        "expressions expect."
    )

# One row per extracted entry: title text and requested presentation format.
abstracts = pd.DataFrame({"Titles": titles, "Format": formats})