// Miles Davis BBC discography scraper
// http://www.bbc.co.uk/cgi-perl/music/muze/index.pl?site=music&action=discography&artist_id=7762
// $Id: $ , eric miller, em@potlach.org
//
// open issues: 
// 
// - specific to Miles Davis BBC structure; not sure how general
//   this can be for other BBC discographies

// Vocabulary namespaces used to build the predicate/type URIs of the
// statements this scraper adds to the model.
var RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
var DCNS = 'http://purl.org/dc/elements/1.1/';
var MBNS = 'http://musicbrainz.org/vocab#';

// Record page URLs; filled (and returned) by gatherRecordURLs.
var pageURLs = [];

// Namespace URI of the current page's root element (may be empty/null).
var namespace = doc.documentElement.namespaceURI;

// Resolver handed to the XPath calls: maps the prefix 'x' to the page's
// namespace and every other prefix to null. When the page has no namespace
// no resolver is used at all.
var nsResolver = null;
if (namespace) {
    nsResolver = function(prefix) {
        return (prefix == 'x') ? namespace : null;
    };
}

// Evaluates `xpath` in `doc`, starting at `contextNode` and resolving
// prefixes with `nsResolver`, and returns the first item produced by the
// result iterator.
var getNode = function(doc, contextNode, xpath, nsResolver) {
    var result = doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE, null);
    var firstNode = result.iterateNext();
    return firstNode;
};
    
// Cleans up a scraped string by passing it through utilities.trimString
// and returning whatever that helper produces.
var cleanString = function(s) {
    var trimmed = utilities.trimString(s);
    return trimmed;
};

// Per-page callback used while walking the collected record pages.
//
//   doc           - the loaded page
//   fDonePageCont - continuation to invoke once this page has been handled
//
// Pages whose URL contains the BBC host are passed to scrapeRecord; any
// other page is left untouched.
var scrapeAPage = function(doc, fDonePageCont) {

    var pageURL = doc.location.href;
    var isBBCPage = pageURL.indexOf("http://www.bbc.co.uk/") >= 0;

    // only BBC pages are scraped as record pages
    if (isBBCPage) {
        scrapeRecord(doc);
    }

    // The continuation has to be called before returning, otherwise the
    // rest of the scraping process cannot complete.
    fDonePageCont();
};


// Scrapes a single record page: finds the album cover image and adds its
// URI to the model as the MBNS coverImage of the page's URL.
//
// Any failure while locating or recording the cover (for example when the
// image lookup yields nothing) is logged via utilities.debugPrint and
// otherwise ignored.
var scrapeRecord = function (doc) {

    utilities.debugPrint("scraping record: " + doc.location.href);

    // location of the album cover <img> inside the page's layout tables
    var coverImageXPath = '/html/body/table/tbody/tr/td[2]/table/tbody/tr[1]/td[2]/table/tbody/tr[1]/td[@class="content"]/table/tbody/tr[1]/td[2]/img';

    try {
        var coverImage = getNode(doc, doc, coverImageXPath, nsResolver);
        var coverURI = cleanString(coverImage.src);

        utilities.debugPrint("album cover uri: " + coverURI);

        // last argument is false: the cover is added as a non-literal value
        // (the title/date statements elsewhere in this file pass true)
        model.addStatement(doc.location.href, MBNS + 'coverImage', coverURI, false);
    } catch (err) {
        utilities.debugPrint(err);
    }
};

// Collects the record pages linked from the discography listing in `doc`.
//
// For every record link found on the page this function
//   - builds the record URI by prefixing the link's href with the BBC host,
//   - adds statements to `model` typing the URI as an MBNS Record and giving
//     it a DC title (the link text) and a DC date (the year shown in the
//     first cell of the link's own table row),
//   - puts the URI at the front of the shared `pageURLs` list so that the
//     record page itself is scraped afterwards.
//
// Returns `pageURLs`; because URIs are unshifted, the record found last
// comes first.
//
// Errors never propagate: they are logged through utilities.debugPrint.
// A record whose href cannot be read is skipped entirely; a missing title
// or date only drops that one statement.
var gatherRecordURLs = function(doc) {

    var bbcHost = 'http://www.bbc.co.uk';

    // links to the records that are part of the discography
    var linkXPath = '/html/body/table/tbody/tr/td[2]/table/tbody/tr[1]/td[2]/table/tbody/tr[1]/td[@class="content"]/div[@class="contentholding"]/div/table/tbody/tr/td[2]/div[@class="bullet1"]/font/a';

    // The year is looked up relative to each link (nearest enclosing row,
    // first cell) instead of by the link's position in the result list.
    // Positional lookup pairs a record with the wrong row as soon as a row
    // has no link, a row has several links, or the listing is spread over
    // more than one table.
    var dateXPath = 'ancestor::tr[1]/td[1]/font[@class="genRM"]/text()[1]';

    try {

        var elmts = utilities.gatherElementsOnXPath(doc, doc, linkXPath, nsResolver);

        for (var i = 0; i < elmts.length; i++) {

            var elmt = elmts[i];

            try {

                // record URI: the href is site-relative, so prefix the host
                var uri = bbcHost + cleanString(getNode(doc, elmt, './@href', nsResolver).nodeValue);

                // add record
                model.addStatement(uri, RDFNS + 'type', MBNS + 'Record', false);

                // add record title (best effort)
                try {

                    var title = cleanString(getNode(doc, elmt, './text()[1]', nsResolver).nodeValue);

                    model.addStatement(uri, DCNS + 'title', title, true);

                } catch (e) {

                    utilities.debugPrint(e);
                }

                // add recorded year as date (best effort)
                try {

                    var date = cleanString(getNode(doc, elmt, dateXPath, nsResolver).nodeValue);

                    utilities.debugPrint("date string: " + date);

                    model.addStatement(uri, DCNS + 'date', date, true);

                } catch (e) {

                    utilities.debugPrint(e);
                }

                // queue the record page so more detail can be harvested from it
                pageURLs.unshift(uri);

            } catch (e) {

                utilities.debugPrint(e);
                utilities.debugPrint("error getting record: " + i);
            }
        }

    } catch (e) {

        // log the cause as well, not just the fact that something failed
        utilities.debugPrint(e);
        utilities.debugPrint("error getting record list ...");
    }

    return pageURLs;
};

//=========================================================
// The following code kickstarts the scraping process.

//
// var uri = doc.location.href;


// Called for each collected page; hands the page and its continuation on
// to scrapeAPage.
var scrapeCollectedPage = function(d, cont) {
    scrapeAPage(d, cont);
};

// Called when a page could not be scraped; tells the user which URL failed.
var reportScrapeError = function(e, url) {
    alert("Error scraping data from " + url + "\n" + e);
};

// Gather the record URLs from the current page and let the host process
// each of them; `done` is passed as the completion callback.
utilities.processDocuments(
    browser,
    doc,
    gatherRecordURLs(doc),
    scrapeCollectedPage,
    done,
    reportScrapeError
);

// don't navigate to collected data just yet

wait();

