// Frappr (http://www.frappr.com/) JavaScript scraper
// $Id: $ , eric miller, em@potlach.org
//
// open issues: 
// 
// - only grabs first 50
// - vocabularies to use 

// Namespace prefixes; property and class URIs are built by appending a
// local name to one of these.
var RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
var FOAFNS = 'http://xmlns.com/foaf/0.1/';
var LOCNS = 'http://simile.mit.edu/2005/05/ontologies/location#';
var FRAPPRNS = 'http://potlach.org/2006/02/frapper#';

// Accumulates the URLs of the member pages found by gatherPageURLs.
var pageURLs = [];

// Callback handed to the geo helper: records the coordinates found for an
// item as a literal "coordinates" statement in the location namespace.
//
//   itemURI - URI of the item the address belongs to
//   latlong - coordinates value supplied by the geo helper
var onAddressLookedUp = function (itemURI, latlong) {
    var coordinatesProperty = LOCNS + "coordinates";

    model.addStatement(itemURI, coordinatesProperty, latlong, true);
};

// Geo helper created with onAddressLookedUp as its callback; addresses are
// queued on it with geoHelper.add() while member pages are scraped.
var geoHelper = utilities.createGeoHelper(onAddressLookedUp, utilities);

// Namespace URI of the document's root element (may be empty/null).
var namespace = doc.documentElement.namespaceURI;

// XPath namespace resolver: maps the prefix 'x' to the document namespace and
// every other prefix to null. Stays null when the document has no namespace.
var nsResolver = null;
if (namespace) {
    nsResolver = function(prefix) {
        return (prefix == 'x') ? namespace : null;
    };
}

// Evaluates an XPath expression relative to contextNode and returns the first
// node the result iterator yields.
//
//   doc         - document used to evaluate the expression
//   contextNode - node the expression is evaluated against
//   xpath       - XPath expression string
//   nsResolver  - namespace resolver passed through to doc.evaluate
var getNode = function(doc, contextNode, xpath, nsResolver) {
    var matches = doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE, null);
    return matches.iterateNext();
};
    
// Normalises a scraped string by delegating to utilities.trimString.
var cleanString = function(s) {
    var trimmed = utilities.trimString(s);
    return trimmed;
};

// Scrapes one loaded page and then hands control back to the caller.
//
//   doc           - the loaded document to scrape
//   fDonePageCont - continuation; it must be invoked before returning so that
//                   the rest of the process can complete
var scrapeAPage = function(doc, fDonePageCont) {

    var pageURL = doc.location.href;

    utilities.debugPrint("harvesting page: " + pageURL);

    // URLs containing "&id=" are member detail pages; anything else is
    // scraped as the group page.
    var scrapePage = (pageURL.indexOf("&id=") !== -1) ? scrapeFrapprPerson : scrapeFrapprGroup;

    scrapePage(doc);

    fDonePageCont();
};


// URI of the group most recently scraped by scrapeFrapprGroup, or null while
// no group page has been scraped successfully. scrapeFrapprPerson reads it to
// connect each member to the group. (Previously the person scraper referenced
// a variable local to the group scraper, so the membership statement was
// never emitted.)
var currentGroupURI = null;


// Scrapes the group page: extracts the group's name and link and records the
// group as a foaf:Group with a homepage and a name. On success the group URI
// is remembered in currentGroupURI. Any failure is logged, not rethrown.
//
//   doc - the group page document
var scrapeFrapprGroup = function (doc) {

    try {

        // get group information

        var group_xpath = '/html/body/center/table[@class="mainpage"]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/font/b[1]';
        var group_name = utilities.gatherElementsOnXPath(doc, doc, group_xpath)[0].innerHTML;

        var link_xpath = '/html/body/center/table[@class="mainpage"]/tbody/tr[1]/td/table/tbody/tr[1]/td/table/tbody/tr/td[2]/a/font/b';
        var group_link = utilities.gatherElementsOnXPath(doc, doc, link_xpath)[0].innerHTML;

        var group_uri = 'http://www.frappr.com/group#' + group_name;

        model.addStatement(group_uri, RDFNS + 'type', FOAFNS + 'Group', false);
        model.addStatement(group_uri, FOAFNS + 'homepage', group_link, false);
        model.addStatement(group_uri, FOAFNS + 'name', group_name, true);

        // publish the URI only once the group has been recorded
        currentGroupURI = group_uri;

    } catch (e) {
        utilities.debugPrint("error getting group information ... " + e);
    }

};


// Scrapes a member detail page: records the member as a foaf:Person with a
// nick and homepage, queues the member's address for geocoding, records the
// city/state/country parts of the address that are present, and connects the
// member to the last scraped group. Any failure is logged, not rethrown.
//
//   doc - the member page document
var scrapeFrapprPerson = function (doc) {

    utilities.debugPrint("scraping page: " + doc.location.href);

    try {

        // get name

        var name_xpath = '/html/body/div[@class="shoutoutbox"]/b[1]/text()[1]';
        var name = cleanString(getNode(doc, doc, name_xpath, nsResolver).nodeValue);

        utilities.debugPrint("name: " + name);

        // construct the item's URI from the name (spaces become underscores)

        var uri = 'http://www.frappr.com/member#' + name.replace(/ /g,"_");

        model.addStatement(uri, RDFNS + 'type', FOAFNS + 'Person', false);

        // add nick

        model.addStatement(uri, FOAFNS + 'nick', name, true);

        // get frappr page for individual member

        var homepage_xpath = '/html/body/div[@class="shoutoutbox"]/a[1]';
        var homepage = cleanString(getNode(doc, doc, homepage_xpath, nsResolver).href);

        utilities.debugPrint("homepage: " + homepage);

        model.addStatement(uri, FOAFNS + "homepage", homepage, false);

        // get location

        var location_xpath = '/html/body/div[@class="shoutoutbox"]/b/font/text()[1]';
        var location_text = cleanString(getNode(doc, doc, location_xpath, nsResolver).nodeValue);

        utilities.debugPrint("location: " + location_text);

        // turn "City, State (Country)" into "City, State, Country"
        var address = utilities.trimString(location_text).replace(/ \(/g, ", ").replace(/\)/g, ", ").replace(/, $/g, "");

        var address_parts = address.split(/, /);
        var city = address_parts[0];
        var state = address_parts[1];
        var country = address_parts[2];

        // add assertions to model

        geoHelper.add(uri, address);

        model.addStatement(uri, LOCNS + "city", city, true);

        // state and country are absent when the address has fewer parts
        if (state) {
            model.addStatement(uri, LOCNS + "state", state, true);
        }

        if (country) {
            model.addStatement(uri, LOCNS + "country", country, true);
        }

        // connect person to group
        // Assumes the group page is scraped before any member page
        // (presumably processDocuments handles the current document first;
        // TODO confirm).
        // NOTE(review): the FOAF spec appears to define foaf:member with the
        // group as subject; the original person -> group direction is kept
        // here. Confirm which direction is wanted.

        if (currentGroupURI) {
            model.addStatement(uri, FOAFNS + 'member', currentGroupURI, false);
        } else {
            utilities.debugPrint("no group scraped yet; skipping group membership for " + uri);
        }

    } catch (e) {
        utilities.debugPrint(e);
    }

};


// Collects the URLs of the member detail pages linked from the group page's
// member list and returns them (the current page's URL is not included).
// URLs are added to the front of the module-level pageURLs array, which is
// also the return value. Failures are logged; a row that cannot be parsed is
// skipped without aborting the remaining rows.
//
//   doc - the group page document
var gatherPageURLs = function(doc) {

  try {

      // get people who are part of a group

      var xpath = '//div[@id="memlist"]/center/table/tbody/tr';
      var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);

      // start at index 2 to skip the "sort by" header rows

      for (var i = 2; i < elmts.length; i++) {

          try {

              // the member's marker id is embedded in the link's onClick
              // handler, e.g. "javascript:openMarker(2116663);"

              var maplink = cleanString(getNode(doc, elmts[i], './td/a/@onClick', nsResolver).nodeValue);
              var userid = maplink.replace(/javascript\:openMarker\(/g, "").replace(/\);/g, "");

              // e.g. http://www.frappr.com/?a=ajax&req=singlemapmarker&maptype=group&id=2116663

              pageURLs.unshift("http://www.frappr.com/?a=ajax&req=singlemapmarker&maptype=group&id=" + userid);

          } catch (e) {

              utilities.debugPrint("error getting person: " + i + " (" + e + ")");
          }

      }

  } catch (e) {

      utilities.debugPrint("error getting page list ... " + e);
  }

  return pageURLs;
};

//=========================================================
// The following code kickstarts the scraping process.

// Debug trace emitted when the script starts running.
utilities.debugPrint("boo");

// Hand the current document and the member page URLs returned by
// gatherPageURLs(doc) to utilities.processDocuments, together with three
// callbacks:
//   1. forwards each document and its continuation to scrapeAPage;
//   2. calls geoHelper.lookupAddresses, passing `done` for both arguments;
//   3. shows an alert with the URL and the error.
// NOTE(review): presumably these are the per-page, completion and error
// callbacks respectively -- confirm against the processDocuments
// documentation.
utilities.processDocuments(browser, doc, gatherPageURLs(doc), 
			   function(d, cont) { scrapeAPage(d, cont); },
			   function() { geoHelper.lookupAddresses(done,done); },
			   function(e, url) { alert("Error scraping data from " + url + "\n" + e); }
			   );

// don't navigate to collected data just yet
// (presumably wait() defers that until `done` is called -- TODO confirm)

wait(); 
