// oclc open worldcat javascript scraper
// $Id: $ , eric miller, em@potlach.org
//
// still to-do:
//
// - remodel data based on RDF FRBR; worldcat data models well into
//   work, expression, manifestation, item (but it is unclear whether the
//   HTML data that is presented is detailed enough to mine effectively)
//
// - only gets first page (10 max) nearest libraries
//
// NOTE: `model`, `utilities`, `browser`, `doc`, `done`, `wait` and `alert`
// are used but not defined in this file; they are expected to be supplied
// by the environment that runs the scraper.

// Namespace prefixes for the predicates and types written to the model.
var RDFNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
var DCNS = "http://purl.org/dc/elements/1.1/";
var BOOKNS = "http://example.com/vocab#";
var LCSHNS = "http://loc.gov/lcsh/";
var LOCNS = "http://simile.mit.edu/2005/05/ontologies/location#";

// Callback handed to the geo helper: records the looked-up coordinates
// as a literal on the item the address was registered for.
var onAddressLookedUp = function (itemURI, latlong) {
    model.addStatement(itemURI, LOCNS + "coordinates", latlong, true);
};

var geoHelper = utilities.createGeoHelper(onAddressLookedUp, utilities);

// Removes a bold field label such as "<strong>Publisher: </strong>" from a
// fragment of innerHTML and returns what is left.
//
//   html  - the innerHTML fragment to clean
//   label - the label text exactly as it appears between the tags,
//           e.g. "Publisher: "
//
// The opening tag is optional and matching is case-insensitive, so both
// "<strong>ISBN: </strong>x" and "ISBN: </STRONG>x" yield "x". Stripping
// only the label and the closing tag would leave a dangling "<strong>" in
// the value that ends up in the model.
var stripFieldLabel = function (html, label) {
    var escapedLabel = label.replace(/[\\^$.*+?()[\]{}|\/]/g, "\\$&");
    var pattern = new RegExp("(?:<strong>)?" + escapedLabel + "<\\/strong>", "gi");
    return html.replace(pattern, "");
};

// Scrapes one loaded page. The URL of the page decides which of the
// scrape* functions below handles it.
//
//   doc           - the document that was loaded
//   fDonePageCont - continuation to invoke once this page is processed
var scrapeAPage = function(doc, fDonePageCont) {
    var href = doc.location.href;

    if (href.indexOf("&zip=") >= 0) {
        // scrape details associated with holdings of worldcat record
        scrapeRecordHoldings(doc);
    } else if (href.indexOf("&tab=details") >= 0) {
        // scrape details associated with worldcat record
        scrapeRecordDetails(doc);
    } else if (href.indexOf("&tab=subjects") >= 0) {
        // scrape subjects associated with worldcat record
        scrapeRecordSubjects(doc);
    } else {
        // scrape worldcat record
        scrapeRecord(doc);
    }

    // You must call this function before returning
    // so that the rest of the process can complete
    fDonePageCont();
};

// Scrapes the main page of a worldcat record: title, publisher, creators
// and ISBN. Each field is best-effort; a field that cannot be found on the
// page is skipped without affecting the others.
var scrapeRecord = function (doc) {
    // all fields live in the same table cell; only the tail of the path differs
    var record_xpath = "//DIV/TABLE/TBODY/TR/TD/DIV/TABLE/TBODY/TR/TD";

    // title
    try {
        var dc_title_xpath = record_xpath + "/H1/A";
        var dc_title = utilities.gatherElementsOnXPath(doc, doc, dc_title_xpath)[0].innerHTML;
        model.addStatement(uri, DCNS + "title", utilities.trimString(dc_title), true);
    } catch (e) {
        // no title available, do nothing
    }

    // publisher
    try {
        var dc_publisher_xpath = record_xpath + "/UL/LI[@class='publisher']";
        var dc_publisher_t = utilities.gatherElementsOnXPath(doc, doc, dc_publisher_xpath)[0].innerHTML;
        var dc_publisher = stripFieldLabel(dc_publisher_t, "Publisher: ");
        model.addStatement(uri, DCNS + "publisher", utilities.trimString(dc_publisher), true);
    } catch (e) {
        // no publisher available, do nothing
    }

    // author
    try {
        var dc_creators_xpath = record_xpath + "/P/STRONG/A";
        var dc_creators = utilities.gatherElementsOnXPath(doc, doc, dc_creators_xpath);
        for (var i = 0; i < dc_creators.length; i++) {
            try {
                // Use the node already gathered. Re-querying with ".../A[n]"
                // selects the n-th A child of each STRONG, not the n-th
                // match overall, and so misses creators that each sit in
                // their own STRONG element.
                var dc_creator = dc_creators[i].innerHTML;
                model.addStatement(uri, DCNS + "creator", utilities.trimString(dc_creator), true);
            } catch (e) {
                // problem getting a specific creator... do nothing
            }
        }
    } catch (e) {
        // no author available, do nothing
    }

    // isbn
    try {
        var isbn_xpath = record_xpath + "/UL/LI[@class='isbn']";
        var isbn_t = utilities.gatherElementsOnXPath(doc, doc, isbn_xpath)[0].innerHTML;
        var isbn = stripFieldLabel(isbn_t, "ISBN: ");
        model.addStatement(uri, BOOKNS + "isbn", utilities.trimString(isbn), true);
    } catch (e) {
        // no ISBN available, do nothing
    }
};

// Scrapes the "libraries nearest you" holdings page of a worldcat record.
// For every result row it records the library (type, title, city, state,
// zip-code), queues the address for a coordinate look-up and states that
// the library holds the record identified by `uri`.
var scrapeRecordHoldings = function (doc) {
    var results_xpath = '//div[@id="page"]/table[@class="tableLayout"]/tbody/tr/td[1][@class="content"]/table[@class="tableResults"]/tbody/tr';
    var library_table_xpath = '/td[2][@class="library"]/table[@class="tableLibrary"]/tbody/tr';

    var library_row = utilities.gatherElementsOnXPath(doc, doc, results_xpath);

    for (var i = 0; i < library_row.length; i++) {
        // XPath positions are 1-based
        var row = i + 1;
        var library_xpath = results_xpath + '[' + row + ']' + library_table_xpath + '/td[1][@class="name"]/a/strong';
        var library_loc_xpath = results_xpath + '[' + row + ']' + library_table_xpath + '/td[2][@class="location"]';

        try {
            utilities.debugPrint("row: " + row);

            // get library name
            var library = utilities.trimString(
                utilities.gatherElementsOnXPath(doc, doc, library_xpath)[0].innerHTML);

            // trim before minting the URI so stray whitespace around the
            // name does not leak into the identifier
            var library_uri = "http://purl.org/tag/library/" + library.replace(/ /g, "_");
            model.addStatement(library_uri, RDFNS + "type", BOOKNS + "Library", false);
            model.addStatement(library_uri, DCNS + "title", library, true);

            // get library location
            var library_loc = utilities.gatherElementsOnXPath(doc, doc, library_loc_xpath)[0].innerHTML;

            // get the coordinates for the library
            // note: google maps look-up seems to get confused over city, state, zip
            // e.g 'Springfield, OH, 45505' entries... code is designed to provide a
            // more deterministic input.
            // Non-breaking spaces (presumably what separates the parts of the
            // location cell, either as the entity or as the raw U+00A0
            // character) become plain spaces; line breaks become ", ".
            var address = utilities.trimString(library_loc)
                .replace(/&nbsp;|\u00a0/g, " ")
                .replace(/\n/g, ", ");

            var parts = address.split(/, /);
            var city = parts[0];
            var state = parts[1];
            var zip = parts[2];

            // only assert the parts that are actually present, so a short
            // address does not prevent the statements that follow
            if (city) {
                model.addStatement(library_uri, LOCNS + "city", city, true);
            }
            if (state) {
                model.addStatement(library_uri, LOCNS + "state", state, true);
            }
            if (zip) {
                model.addStatement(library_uri, LOCNS + "zip-code", zip, true);
            }

            // queue the address for the coordinate look-up
            geoHelper.add(library_uri, address);

            // indicate the library holds the book
            model.addStatement(library_uri, BOOKNS + "holds", uri, false);
        } catch (e) {
            // problem adding information based on scraped page
            utilities.debugPrint("error processing library location on geo page... entry number: " + row);
            utilities.debugPrint("library xpath: " + library_xpath);
            utilities.debugPrint("library location xpath: " + library_loc_xpath);
            utilities.debugPrint("error: " + e);
        }
    }
};

// Scrapes the "details" tab of a worldcat record: genre, named person and
// material type become literals on `uri`; the document type becomes an
// rdf:type in the BOOKNS namespace.
var scrapeRecordDetails = function (doc) {
    try {
        var xpath = '/HTML/BODY/DIV/TABLE/TBODY/TR/TD/BLOCKQUOTE/UL/LI/A';
        var elements = utilities.gatherElementsOnXPath(doc, doc, xpath);

        for (var i = 0; i < elements.length; i++) {
            var element = elements[i].innerHTML;

            if (element.indexOf("Genre/Form: ") >= 0) {
                var genre = stripFieldLabel(element, "Genre/Form: ");
                model.addStatement(uri, BOOKNS + "genre", utilities.trimString(genre), true);
            } else if (element.indexOf("Named Person: ") >= 0) {
                var person = stripFieldLabel(element, "Named Person: ");
                model.addStatement(uri, BOOKNS + "namedPerson", utilities.trimString(person), true);
            } else if (element.indexOf("Material Type: ") >= 0) {
                var mt = stripFieldLabel(element, "Material Type: ");
                model.addStatement(uri, BOOKNS + "materialType", utilities.trimString(mt), true);
            } else if (element.indexOf("Document Type: ") >= 0) {
                var dt = stripFieldLabel(element, "Document Type: ");
                model.addStatement(uri, RDFNS + "type", BOOKNS + utilities.trimString(dt), false);
            }
        }
    } catch (e) {
        // problem getting record details... skip the rest of this page
        utilities.debugPrint("error scraping record details: " + e);
    }
};

// Scrapes the "subjects" tab of a worldcat record: every listed link
// becomes a dc:subject literal on `uri`.
var scrapeRecordSubjects = function (doc) {
    try {
        var xpath = '/HTML/BODY/DIV/TABLE/TBODY/TR/TD/BLOCKQUOTE/UL/LI/A';
        var elements = utilities.gatherElementsOnXPath(doc, doc, xpath);

        for (var i = 0; i < elements.length; i++) {
            var element = elements[i].innerHTML;
            model.addStatement(uri, DCNS + "subject", utilities.trimString(element), true);
        }
    } catch (e) {
        // problem getting record subjects... skip the rest of this page
        utilities.debugPrint("error scraping record subjects: " + e);
    }
};

//=========================================================

// URLs of the additional pages to scrape. The holdings URL (if any) is
// added by the top-level code below; the details and subjects tabs are
// added by gatherPageURLs().
var pageURLs = [];

// Returns the array of URLs (strings) of the other pages to scrape. It
// does not include the URL of the current page. The details and subjects
// tabs of the current record are put at the front of the shared pageURLs
// array, which is then returned.
var gatherPageURLs = function(doc) {
    var currentURL = doc.location.href;

    pageURLs.unshift(currentURL + "&tab=details");
    pageURLs.unshift(currentURL + "&tab=subjects");

    return pageURLs;
};

// the following code is used to create the URL for the holdings record nearest you
var zipin = browser.contentWindow.prompt("Enter your Zipcode to find libraries near you that have this resource: ", "", "Simile / Piggy Bank - OCLC Open Worldcat Scraper");
var holding_query_prefix = "http://www.worldcatlibraries.org/wcpa/servlet/WLAHoldingServlet?";

// normalised zip code; stays empty when the prompt was cancelled or left blank
var zip = "";

try {
    if (zipin !== null && zipin !== undefined) {
        zip = utilities.trimString(String(zipin));
    }

    // no zip code entered: do not queue a holdings page at all (otherwise a
    // URL ending in "zip=null" or "zip=" would be requested)
    if (zip.length > 0) {
        // need both 'query' and 'sessionid'; a missing input makes the [0]
        // access throw, which lands in the catch below
        var query_xpath = "//INPUT[@name='query'][@value]";
        var query = utilities.gatherElementsOnXPath(doc, doc, query_xpath)[0].value;
        utilities.debugPrint("query" + query);

        var session_xpath = "//INPUT[@name='sessionid'][@value]";
        var session = utilities.gatherElementsOnXPath(doc, doc, session_xpath)[0].value;

        // construct holding url from all bits and add it to pages to harvest.
        // The zip is typed by the user, so it is escaped before it goes into
        // the query string.
        // NOTE(review): query and session are taken from the page's form
        // inputs and inserted unescaped - confirm they never need encoding.
        var holdings_url = holding_query_prefix + "query=" + query + "&sessionid=" + session + "&recno=1&zip=" + encodeURIComponent(zip);

        utilities.debugPrint("sessionid:" + session);
        utilities.debugPrint("zipin:" + zipin);
        utilities.debugPrint(holdings_url);

        pageURLs.unshift(holdings_url);
    }
} catch (e) {
    // no query or no session on the page: no holdings page is queued
    utilities.debugPrint("holdings look-up skipped: " + e);
}

//=========================================================
// The following code kickstarts the scraping process.
// URL of the record being scraped; the scrape* functions use it as the
// subject of the statements they add to the model.
var uri = doc.location.href;

utilities.debugPrint("boo");

// called for every document that gets loaded
var onPageLoaded = function (d, cont) {
    scrapeAPage(d, cont);
};

// called once all pages have been processed: resolve the queued addresses
var onAllPagesScraped = function () {
    geoHelper.lookupAddresses(done, done);
};

// called when a page could not be processed
var onScrapeError = function (e, url) {
    alert("Error scraping data from " + url + "\n" + e);
};

utilities.processDocuments(
    browser,
    doc,
    gatherPageURLs(doc),
    onPageLoaded,
    onAllPagesScraped,
    onScrapeError
);

// don't navigate to collected data just yet
wait();