// NCBI XML pubmed javascript scraper
// $Id: $ , eric miller, em@potlach.org
//
// still to-do: 
// 
// - model article <-> journal publication relationship
//
// - extract and associate relevant gene and protein information with article
//
// - enhance skos model of mesh terms
//

// Namespace prefixes; predicate and class URIs are built by appending a
// local name (e.g. RDFNS + "type", DCNS + "title").
var RDFNS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
var DCNS = "http://purl.org/dc/elements/1.1/";
// DCTNS is not referenced by the code visible in this file.
var DCTNS = "http://purl.org/dc/terms/";
var FOAFNS = "http://xmlns.com/foaf/0.1/";
var SKOSNS = "http://www.w3.org/2004/02/skos/core#";
// NOTE(review): example.com looks like a placeholder vocabulary namespace
// (used for Article, Journal, affiliation, isPublishedIn) -- confirm.
var PMNS = "http://example.com/vocab#";
// MESHNS and GENENS are not referenced by the code visible in this file.
var MESHNS = "http://www.nlm.nih.gov/mesh/vocab#";
var GENENS = "http://uniprot.org/gene/vocab#";


// Shared mutable state: both are assigned by gatherPageURLs() from the URL
// of the page being scraped and read by the scrape* functions.
var pmid = "";
var uri = "";
// URI prefixes used to mint resource identifiers (MeSH terms, authors and
// articles, journals). generoot is not referenced by the code visible here.
var meshroot = "http://www.nlm.nih.gov/mesh/term/";
var generoot = "http://purl.org/lsid/gene/";
var pubmedroot = "http://purl.org/lsid/ncbi/";
var journalroot = "http://purl.org/lsid/nlm/journal/";

// Query-string suffix appended to the eutils efetch URL built in
// gatherPageURLs(); presumably identifies this tool and a contact address
// to NCBI -- confirm against the E-utilities usage guidelines.
var pbtool = "&tool=piggybank&email=em@potlach.org";


// Evaluates `xpath` relative to `contextNode` within `doc` and returns the
// first node of the result (whatever iterateNext() yields first; the callers
// in this file treat a missing match as a failed lookup).
var getNode = function(doc, contextNode, xpath) {
    var matches = doc.evaluate(xpath, contextNode, null, XPathResult.ANY_TYPE, null);
    var firstMatch = matches.iterateNext();
    return firstMatch;
};
    
// Normalises a scraped string by delegating to the host's
// utilities.trimString() and returning its result.
var cleanString = function(s) {
    var trimmed = utilities.trimString(s);
    return trimmed;
};
	

// Scrapes one loaded page and then hands control back to the caller.
//
// doc           - the loaded document; its location decides which scraper runs.
// fDonePageCont - continuation that must be invoked before returning so the
//                 rest of the scraping process can complete.
var scrapeAPage = function(doc, fDonePageCont) {

    // The XML surrogate is recognised by the "retmode=xml" marker in its URL;
    // anything else is treated as the regular PubMed record page.
    var isXmlSurrogate = doc.location.href.indexOf("retmode=xml") !== -1;

    if (!isXmlSurrogate) {
        scrapeRecord(doc);
    } else {
        scrapeXMLRecord(doc);
    }

    fDonePageCont();
};

        
// Scrapes the XML surrogate of an NCBI PubMed record and adds statements
// about the article identified by the global `uri` to `model`: title,
// abstract, affiliation, authors, MeSH subjects and journal.
//
// doc - the document holding the XML record.
//
// Returns nothing. Every section is best-effort: a field that is missing
// from the record is skipped and never aborts the rest of the scrape.
var scrapeXMLRecord = function (doc) {

    // Trimmed innerHTML of the first element matching `xpath`. Throws when
    // nothing matches (element [0] is undefined); the sections below rely on
    // that to skip fields that are absent from the record.
    var firstElementText = function (xpath) {
        var elements = utilities.gatherElementsOnXPath(doc, doc, xpath);
        return utilities.trimString(elements[0].innerHTML);
    };

    // Cleaned value of the node selected by `xpath` relative to
    // `contextNode`, or "" when the node is missing or cannot be read.
    var childText = function (contextNode, xpath) {
        try {
            return cleanString(getNode(doc, contextNode, xpath).nodeValue);
        } catch (e) {
            utilities.debugPrint(e);
            return "";
        }
    };

    // article title

    try {
        model.addStatement(uri, DCNS + "title", firstElementText("//article/articletitle"), true);
    } catch (e) {
        // no title available, do nothing
    }

    // article abstract

    try {
        model.addStatement(uri, DCNS + "description", firstElementText('//article/abstract/abstracttext'), true);
    } catch (e) {
        // no abstract available, do nothing
    }

    // article affiliation

    try {
        model.addStatement(uri, PMNS + "affiliation", firstElementText('//article/affiliation'), true);
    } catch (e) {
        // no affiliation available, do nothing
    }

    // article authors

    try {
        var authors = utilities.gatherElementsOnXPath(doc, doc, '//authorlist/author');

        for (var i = 0; i < authors.length; i++) {

            var author = authors[i];
            var lastName = childText(author, './lastname[1]/text()[1]');

            // Some records carry the given name in a forename element and
            // others in a firstname element, so only fall back to the second
            // when the first is missing or empty.
            var firstName = childText(author, './forename[1]/text()[1]');
            if (!firstName) {
                firstName = childText(author, './FIRSTNAME[1]/text()[1]');
            }

            // Without a last name no author resource is created.
            if (!lastName) {
                continue;
            }

            // NOTE(review): unlike the MeSH ids below, spaces inside names are
            // not replaced here, so the author URI may contain spaces --
            // confirm whether that is intended.
            var aid = firstName + "_" + lastName;
            var alabel = firstName + " " + lastName;
            var creatoruri = pubmedroot + "pubmed/author/" + aid;

            // add person

            model.addStatement(creatoruri, RDFNS + "type", FOAFNS + "Person", false);
            model.addStatement(creatoruri, FOAFNS + "lastName", lastName, true);
            model.addStatement(creatoruri, FOAFNS + "firstName", firstName, true);
            model.addStatement(creatoruri, RDFNS + "value", alabel, true);

            // connect person to article

            model.addStatement(uri, DCNS + "creator", creatoruri, false);
        }

    } catch (e) {
        // no author list available, do nothing
    }

    // article's MeSH subjects

    try {
        var subjects = utilities.gatherElementsOnXPath(doc, doc, '//meshheadinglist/meshheading');

        for (var j = 0; j < subjects.length; j++) {

            var descriptorName = childText(subjects[j], './descriptorname[1]/text()[1]');

            if (!descriptorName) {
                continue;
            }

            // The term URI is the descriptor name with spaces turned into
            // underscores, appended to the MeSH root.
            var meshtermuri = meshroot + descriptorName.replace(/ /g, "_");

            // add subject

            model.addStatement(meshtermuri, RDFNS + "type", SKOSNS + "Concept", false);
            model.addStatement(meshtermuri, SKOSNS + "prefLabel", descriptorName, true);
            model.addStatement(meshtermuri, RDFNS + "value", descriptorName, true);

            // connect subject to article

            model.addStatement(uri, DCNS + "subject", meshtermuri, false);
        }

    } catch (e) {
        // no MeSH subject list available, do nothing
    }

    // journal information; both the title and the NLM unique id are needed
    // before anything is recorded, so a failure on either one skips the
    // whole section.

    try {
        var journalTitle = firstElementText('//medlinejournalinfo/medlineta');
        var journalId = firstElementText('//medlinejournalinfo/nlmuniqueid');
        var journalUri = journalroot + journalId;

        model.addStatement(journalUri, RDFNS + "type", PMNS + "Journal", false);
        model.addStatement(journalUri, DCNS + "title", journalTitle, true);

        // connect article to journal
        // TODO: this is probably not the right relationship; see the
        // article <-> journal modelling item in the file header.

        model.addStatement(uri, PMNS + "isPublishedIn", journalUri, false);

    } catch (e) {
        // no journal title or journal ID available, do nothing
        utilities.debugPrint(e);
    }
};


// Scrapes the regular (non-XML) details page of an NCBI PubMed record.
//
// Very little is taken from this page: it only records that the resource
// named by the global `uri` is an Article. The rest of the data about the
// article comes from the XML surrogate.
var scrapeRecord = function (doc) {
    var typePredicate = RDFNS + "type";
    var articleClass = PMNS + "Article";

    model.addStatement(uri, typePredicate, articleClass, false);
};
	
       

//=========================================================

// URLs of the additional pages to scrape; filled in by gatherPageURLs().
var pageURLs = [];

// Returns the array of other page URLs to scrape for the record shown in
// `doc`. The current page's own URL is not included.
//
// A PubMed record page such as
//   http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=15489161&query_hl=7
// has an XML surrogate available through eutils:
//   http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=15489161&retmode=xml
//
// Side effects: sets the globals `pmid` (value of the list_uids query
// parameter) and `uri` (the article URI derived from it), and puts the
// eutils URL at the front of the shared `pageURLs` array, which is also the
// return value.
var gatherPageURLs = function(doc) {

    var currentURL = doc.location.href;

    // Drop everything up to and including "list_uids=", whether it is the
    // first query parameter ("?") or a later one ("&"), then cut the value
    // off at the next parameter or at a trailing fragment.
    pmid = currentURL.replace(/.*[?&]list_uids=/, "").split(/[&#]/)[0];
    uri = pubmedroot + "pmid/" + pmid;

    var eutilsuri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml" + pbtool;

    pageURLs.unshift(eutilsuri);

    return pageURLs;
};
  


//=========================================================
// The following code kickstarts the scraping process.

//
// var uri = doc.location.href;


// Start the scrape. gatherPageURLs(doc) is evaluated first, so the globals
// `pmid` and `uri` are set before any page is scraped. The remaining
// arguments are the per-page scraper (which receives a document and a
// continuation), the `done` callback (not defined in this file; presumably
// supplied by the host environment) and an error handler that alerts the
// failing URL together with the error.
utilities.processDocuments(browser, doc, gatherPageURLs(doc), 
			   function(d, cont) { scrapeAPage(d, cont); },
			   done,
			   function(e, url) { alert("Error scraping data from " + url + "\n" + e); }
			   );

// don't navigate to collected data just yet
// NOTE(review): wait() is not defined in this file; presumably a host
// function that defers completion until `done` is called -- confirm.

wait(); 


