// oclc open worldcat javascript scraper
// $Id: $ , eric miller, em@potlach.org
//
// still to-do: 
// 
// - remodel data based on RDF FRBR; worldcat data models well into
//   work, expression, manifestation, item (but it's unclear whether the
//   HTML data that is presented is detailed enough to mine effectively)
// 
// - only gets first page (10 max) nearest libraries

// Namespace prefixes. Predicate and type URIs for the statements added to
// the model are built by appending a local name to one of these.

// RDF syntax namespace (used below for "type").
var RDFNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
// Dublin Core elements (used below for title, publisher, creator, subject).
var DCNS = "http://purl.org/dc/elements/1.1/";
// Placeholder (example.com) vocabulary for book/library specific terms
// (isbn, genre, namedPerson, materialType, Library, holds).
var BOOKNS = "http://example.com/vocab#";
// LCSH namespace; not referenced in the visible part of this file.
var LCSHNS = "http://loc.gov/lcsh/";
// Simile location ontology (coordinates, city, state, zip-code).
var LOCNS = "http://simile.mit.edu/2005/05/ontologies/location#";


// Callback registered with the geo helper. Records the coordinates that were
// found for an item as a LOCNS "coordinates" statement on that item.
//
// itemURI: URI of the item the address belongs to
// latlong: the looked-up coordinates, stored as given
//
// (the trailing `true` appears to mark the object as a literal value, going
// by how addStatement is used elsewhere in this file)
var onAddressLookedUp = function (itemURI, latlong) {
  var coordinatesPredicate = LOCNS + "coordinates";
  model.addStatement(itemURI, coordinatesPredicate, latlong, true);
};

// Geo helper shared by the whole scraper. Library addresses are handed to it
// with geoHelper.add() while the holdings page is scraped, and
// geoHelper.lookupAddresses() is called once all pages have been processed;
// results are presumably reported back through onAddressLookedUp.
var geoHelper = utilities.createGeoHelper(onAddressLookedUp, utilities);

// Dispatches one loaded page to the scraper that understands it, then
// signals that the page is finished.
//
// doc:           the DOM document of the page; its URL decides the scraper
// fDonePageCont: continuation that must be invoked before returning so the
//                rest of the process can complete
var scrapeAPage = function(doc, fDonePageCont) {

  var pageURL = doc.location.href;

  // URL marker -> scraper, checked in this order; the first marker found
  // in the URL wins
  var routes = [
    ["&zip=",         function () { scrapeRecordHoldings(doc); }],  // holdings near a zip code
    ["&tab=details",  function () { scrapeRecordDetails(doc); }],   // details tab
    ["&tab=subjects", function () { scrapeRecordSubjects(doc); }]   // subjects tab
  ];

  // anything without a marker is the worldcat record page itself
  var scrapeIt = function () { scrapeRecord(doc); };

  for (var r = 0; r < routes.length; r++) {
    if (pageURL.indexOf(routes[r][0]) >= 0) {
      scrapeIt = routes[r][1];
      break;
    }
  }

  scrapeIt();

  fDonePageCont();
};


var scrapeRecord = function (doc) {

  // this function scrapes the main page of a worldcat record and adds
  // title, publisher, creator and isbn statements about the global `uri`
  // to the model.
  //
  // doc: the DOM document of the record page
  // returns: nothing
  //
  // every field is optional: a field that cannot be found on the page is
  // skipped and the remaining fields are still scraped.

  // all fields live under the same table cell
  var cell_xpath = "//DIV/TABLE/TBODY/TR/TD/DIV/TABLE/TBODY/TR/TD";

  // innerHTML of the first node matching xpath; throws when nothing matches
  var firstInnerHTML = function (xpath) {
    return utilities.gatherElementsOnXPath(doc, doc, xpath)[0].innerHTML;
  };

  // title

  try {
    var dc_title = firstInnerHTML(cell_xpath + "/H1/A");
    model.addStatement(uri, DCNS + "title", utilities.trimString(dc_title), true);
  } catch (e) {
    // no title available, do nothing
  }

  // publisher

  try {
    var dc_publisher_t = firstInnerHTML(cell_xpath + "/UL/LI[@class='publisher']");
    // drop the field label, keep only the value
    var dc_publisher = dc_publisher_t.replace(/\<strong>Publisher: <\/strong>/g, "");
    model.addStatement(uri, DCNS + "publisher", utilities.trimString(dc_publisher), true);
  } catch (e) {
    // no publisher available, do nothing
  }

  // author

  try {
    var dc_creators = utilities.gatherElementsOnXPath(doc, doc, cell_xpath + "/P/STRONG/A");

    // use the nodes that were just gathered. re-querying with ".../A[n]"
    // would count positions inside each STRONG separately, so creators
    // sitting in their own STRONG element would never be found for n > 1.

    for (var i = 0; i < dc_creators.length; i++) {
      try {
        var dc_creator = dc_creators[i].innerHTML;
        model.addStatement(uri, DCNS + "creator", utilities.trimString(dc_creator), true);
      } catch (e) {
        // problem with this specific creator... carry on with the next one
      }
    }

  } catch (e) {
    // no author available, do nothing
  }

  // isbn

  try {
    var isbn_t = firstInnerHTML(cell_xpath + "/UL/LI[@class='isbn']");
    // drop the field label, keep only the value
    var isbn = isbn_t.replace(/\<strong\>ISBN: \<\/strong\>/g, "");
    model.addStatement(uri, BOOKNS + "isbn", utilities.trimString(isbn), true);
  } catch (e) {
    // no ISBN available, do nothing
  }

};

    
    
var scrapeRecordHoldings = function (doc) {

  // this function scrapes the holdings nearest-you associated with a
  // worldcat record. for every row of the results table it adds a Library
  // resource (type, title, city, state, zip-code), queues the library
  // address with the geo helper, and states that the library holds the
  // record identified by the global `uri`.
  //
  // doc: the DOM document of the holdings page
  // returns: nothing
  //
  // a row that cannot be parsed is reported through debugPrint and skipped;
  // the remaining rows are still processed.

  var results_xpath = '//div[@id="page"]/table[@class="tableLayout"]/tbody/tr/td[1][@class="content"]/table[@class="tableResults"]/tbody/tr';

  var library_rows = utilities.gatherElementsOnXPath(doc, doc, results_xpath);

  for (var i = 0; i < library_rows.length; i++) {

    // xpath positions are 1-based
    var row = i + 1;

    var library_cells_xpath = results_xpath + '[' + row + ']/td[2][@class="library"]/table[@class="tableLibrary"]/tbody/tr';
    var library_xpath = library_cells_xpath + '/td[1][@class="name"]/a/strong';
    var library_loc_xpath = library_cells_xpath + '/td[2][@class="location"]';

    try {

      utilities.debugPrint("row: " + row);

      // get library name; trim it before it is used so that whitespace
      // around the name does not end up inside the library uri

      var library = utilities.trimString(utilities.gatherElementsOnXPath(doc, doc, library_xpath)[0].innerHTML);

      var library_uri = "http://purl.org/tag/library/" + library.replace(/ /g, "_");

      model.addStatement(library_uri, RDFNS + "type", BOOKNS + "Library", false);
      model.addStatement(library_uri, DCNS + "title", library, true);

      // get library location

      var library_loc = utilities.gatherElementsOnXPath(doc, doc, library_loc_xpath)[0].innerHTML;

      // get the coordinates for the library
      // note: google maps look-up seems to get confused over city, state, zip
      // e.g 'Springfield, OH, 45505' entries... code is designed to provide a more deterministic input

      var address = utilities.trimString(library_loc).replace(/\&nbsp\;/g, " ").replace(/\n/g, ", ");

      // expected shape is "city, state, zip"; missing parts stay undefined
      var address_parts = address.split(/, /);
      var city = address_parts[0];
      var state = address_parts[1];
      var zip = address_parts[2];

      model.addStatement(library_uri, LOCNS + "city", city, true);
      model.addStatement(library_uri, LOCNS + "state", state, true);
      model.addStatement(library_uri, LOCNS + "zip-code", zip, true);

      // queue the address for the coordinate look-up

      geoHelper.add(library_uri, address);

      // indicate the library holds the book

      model.addStatement(library_uri, BOOKNS + "holds", uri, false);

    } catch (e) {

      // problem adding information based on scraped page

      utilities.debugPrint("error processing library location on geo page... entry number: " + row);
      utilities.debugPrint("library xpath: " + library_xpath);
      utilities.debugPrint("library location xpath: " + library_loc_xpath);
      utilities.debugPrint("error: " + e);
    }
  }
};

var scrapeRecordDetails = function (doc) {

  // this function scrapes the details page associated with a worldcat
  // record. each matching list entry starts with a "<strong>Label: </strong>"
  // marker; the text left after removing the marker becomes the value of a
  // statement about the global `uri`.
  //
  // doc: the DOM document of the details page
  // returns: nothing
  //
  // a failure while scraping is reported through debugPrint and ends the
  // scrape of this page; it is never propagated to the caller.

  // label marker -> predicate. entries flagged as_type are recorded as a
  // type whose object is a BOOKNS resource; all others are recorded with
  // the value as the statement's object and a trailing `true`.
  var fields = [
    { marker: "<strong>Genre/Form: </strong>",    predicate: BOOKNS + "genre",        as_type: false },
    { marker: "<strong>Named Person: </strong>",  predicate: BOOKNS + "namedPerson",  as_type: false },
    { marker: "<strong>Material Type: </strong>", predicate: BOOKNS + "materialType", as_type: false },
    { marker: "<strong>Document Type: </strong>", predicate: RDFNS + "type",          as_type: true }
  ];

  try {
    var xpath = '/HTML/BODY/DIV/TABLE/TBODY/TR/TD/BLOCKQUOTE/UL/LI/A';
    var elements = utilities.gatherElementsOnXPath(doc, doc, xpath);

    for (var i = 0; i < elements.length; i++) {

      var element = elements[i].innerHTML;

      for (var f = 0; f < fields.length; f++) {

        var field = fields[f];

        if (element.indexOf(field.marker) < 0) {
          continue;
        }

        // remove every occurrence of the marker, keep the value
        var value = utilities.trimString(element.split(field.marker).join(""));

        if (field.as_type) {
          model.addStatement(uri, field.predicate, BOOKNS + value, false);
        } else {
          model.addStatement(uri, field.predicate, value, true);
        }

        // an entry is recorded under the first label it matches only
        break;
      }
    }
  } catch (e) {
    // problem getting record details... report it and move on
    utilities.debugPrint("error scraping record details: " + e);
  }

};


var scrapeRecordSubjects = function (doc) {

  // this function scrapes the subjects page associated with a worldcat
  // record: the trimmed innerHTML of every matching list entry is added as
  // a "subject" statement about the global `uri`.
  //
  // doc: the DOM document of the subjects page
  // returns: nothing
  //
  // a failure while scraping is reported through debugPrint and ends the
  // scrape of this page; it is never propagated to the caller.

  try {
    var xpath = '/HTML/BODY/DIV/TABLE/TBODY/TR/TD/BLOCKQUOTE/UL/LI/A';
    var elements = utilities.gatherElementsOnXPath(doc, doc, xpath);

    for (var i = 0; i < elements.length; i++) {
      var subject = utilities.trimString(elements[i].innerHTML);
      model.addStatement(uri, DCNS + "subject", subject, true);
    }
  } catch (e) {
    // problem getting record subjects... report it and move on
    utilities.debugPrint("error scraping record subjects: " + e);
  }

};


//=========================================================

// URLs of the additional pages to scrape. Code elsewhere in this file may
// put entries in here as well, so the array is shared rather than local.
var pageURLs = [];

// Adds the "subjects" and "details" tab URLs of the current record to the
// front of pageURLs (subjects first) and returns that same array.
//
// doc: the DOM document of the current page; its own URL is not included
//      in the result
// returns: pageURLs, the array of page URLs (strings) still to be scraped
var gatherPageURLs = function(doc) {

  var recordURL = doc.location.href;

  pageURLs.unshift(recordURL + "&tab=subjects", recordURL + "&tab=details");

  return pageURLs;
};
  

// the following code is used to create the URL for the holdings record nearest you

// NOTE(review): `zip` is not read anywhere in the visible part of this file;
// kept in case code outside this view relies on it.
var zip = 0;

// ask for the zip code up front. prompt() gives back null when the dialog is
// cancelled and "" when it is confirmed without any input.
var zipin = browser.contentWindow.prompt("Enter your Zipcode to find libraries near you that have this resource: ", "", "Simile / Piggy Bank - OCLC Open Worldcat Scraper");

var holding_query_prefix = "http://www.worldcatlibraries.org/wcpa/servlet/WLAHoldingServlet?";

if (zipin) {

  // the holdings url needs both 'query' and 'sessionid', which are read
  // from the INPUT elements of the current page. if either one is missing
  // the [0] lookup throws and no holdings page is queued.

  try {
    var query_xpath = "//INPUT[@name='query'][@value]";
    var query = utilities.gatherElementsOnXPath(doc, doc, query_xpath)[0].value;

    utilities.debugPrint("query" + query);

    var session_xpath = "//INPUT[@name='sessionid'][@value]";
    var session = utilities.gatherElementsOnXPath(doc, doc, session_xpath)[0].value;

    // construct holding url from all bits and add it to pages to harvest.
    // the zip code is typed by the user, so escape it before it goes into
    // the query string.

    var holdings_url = holding_query_prefix + "query=" + query + "&sessionid=" + session + "&recno=1&zip=" + encodeURIComponent(zipin);

    utilities.debugPrint("sessionid:" + session);
    utilities.debugPrint("zipin:" + zipin);
    utilities.debugPrint(holdings_url);

    pageURLs.unshift(holdings_url);

  } catch (e) {
    // no query or no session on this page: skip the holdings page
    utilities.debugPrint("holdings url not built: " + e);
  }

} else {
  // no zip code entered: there is nothing to look up, skip the holdings page
  utilities.debugPrint("no zip code entered, skipping holdings page");
}


//=========================================================
// The following code kickstarts the scraping process.

// URI of the record being scraped; the scrape* functions above use it as the
// subject (or, for "holds", the object) of the statements they add.
var uri = doc.location.href;


utilities.debugPrint("boo");


// Process the current document plus every URL returned by gatherPageURLs:
//  - 1st callback: run for each page; scrapeAPage invokes `cont` when done
//  - 2nd callback: run after the pages are processed; starts the address
//    look-ups, passing the host's `done` for both arguments (presumably the
//    success and failure continuations -- TODO confirm)
//  - 3rd callback: reports a page that could not be scraped
utilities.processDocuments(browser, doc, gatherPageURLs(doc), 
			   function(d, cont) { scrapeAPage(d, cont); },
			   function() { geoHelper.lookupAddresses(done,done); },
			   function(e, url) { alert("Error scraping data from " + url + "\n" + e); }
			   );

// don't navigate to collected data just yet
// (presumably the host waits here until `done` is called -- TODO confirm)

wait(); 

