2.1 Workflow Introduction
This is an R Markdown
Notebook. When you execute code within the notebook, the results appear
beneath the code. The online version displays the results for one
Patient. You can visit our main page at bigcat-um.github.io/IMD-PUPY/
2.2 Clinical Biomarker Data
Loading patient specific data
Note: this step requires input from the user, to select data from
another patient as the example (QTOF_KV_184).
## Load data for all patients
patientdata <- read.csv("Data/Data_PuPyMS_QTOF_KV_BIGCAT.csv")
##Select Patient ID here (options: A through T)
patientID <- "I" ## patients B, C, P and Q do not have data on the AA panel
#if value for age is not numeric (e.g. "newborn", sample: QTOF_KV_112, don't perform as.numeric.)
if(grepl("^[A-Za-z]+$", patientdata[1,patientID], perl = T)){
print("Warning: Non-numeric age value is part of data")
if(patientdata[1,patientID] == "newborn"){
age <- as.numeric(0)
}
else{print("Warning: name for age value not recognized")}
}else{
age <- as.numeric(patientdata[1,patientID])
}
## Note there is a difference in the age categories between the two assays
##PUPY: (age in years)
if(age >= 0 && age < 12){
ref_age_PUPY = c(5,6) #age_0to1y
agerange <- "0 to 1 years"
}else if(age >= 12 && age < 60){
ref_age_PUPY = c(7,8) #age_1to5y
agerange <- "1 to 5 years"
}else if(age >= 60 && age < 192){
ref_age_PUPY = c(9,10) #age_5to16y
agerange <- "5 to 16 years"
}else if(age >= 192){
ref_age_PUPY = c(11,12) #age_16yandUp
agerange <- "16+ years"
}else{
ref_age_PUPY = c(13,14) #age_0to16
agerange <- "0 to 16 years"
}
##Urea: (age in months = m, in years = y)
if(age >= 0 && age < 1){
ref_age_urea = c(4,5) #age_0to1m
}else if(age >= 1 && age < 6){
ref_age_urea = c(6,7) #age_1to6m
}else if(age >= 6 && age < 12){
ref_age_urea = c(8,9) #age_6mto1y
}else if(age >= 12 && age < 24){
ref_age_urea = c(10,11) #age_1to2y
}else if(age >= 24 && age < 48){
ref_age_urea = c(12,13) #age_2-4y
}else if(age >= 48 && age < 120){
ref_age_urea = c(14,15) #age_4-10y
}else if(age >= 120 && age < 216){
ref_age_urea = c(16,17) #age_10-18y
}else{
ref_age_urea = c(18,19) #age_18yandUp
}
print(paste("Selected Patient ID is: " , patientID, ", age is between: ", agerange, " old"))
[1] "Selected Patient ID is: I , age is between: 0 to 1 years old"
Combine Reference + Patient data for each assay seperatly
#Load reference data (includes annotations for biomarkers)
refvalues_PUPY <- read.delim("Data/Reference_values_PUPY_noNAs.txt")
refvalues_urea <- read.delim("Data/Reference_values_Urea.txt")
# Select column based on patientID:
patient.column <- as.integer(match(patientID,names(patientdata)))
##Select Referencedata for one patient
patientID_PUPY <- as.data.frame(patientdata[4:37, c(1,patient.column)], drop=false) #Selecting data for PUPY only
patientID_urea <- as.data.frame(patientdata[42:95, c(1,patient.column)], drop=false) #Selecting data for Urea Acids (including Amino Acids) only
patientID_treatment <- as.data.frame(patientdata[97, c(1,patient.column)], drop=false) #Selecting data for Urea Acids (including Amino Acids) only
## Choose correct reference value column for further data visualization
age_patientID_PUPY <- as.data.frame(refvalues_PUPY[ , c(1:3, ref_age_PUPY[1], ref_age_PUPY[2], 15 )], drop=false) #Selecting reference data for PUPY, age: 0 to 1 year
remove(refvalues_PUPY,ref_age_PUPY)
age_patientID_urea <- as.data.frame(refvalues_urea[ , c(1:3, ref_age_urea[1], ref_age_urea[2], 20 )], drop=false) #Selecting reference data for Urea, age: 0 to 1 month
remove(refvalues_urea,ref_age_urea)
remove(patient.column)
print(paste("Relevant age data is selected for patient: " , patientID))
[1] "Relevant age data is selected for patient: I"
2.3 Pathway Models
Retrieve Metabolite, Protein and Disease Data from Pathway Models
if(!"SPARQL" %in% installed.packages()){
install.packages("SPARQL")
}
library(SPARQL)
##Connect to Endpoint WikiPathways
endpointwp <- "https://sparql.wikipathways.org/sparql"
## 1. Query metadata:
queryMetadata <-
"SELECT DISTINCT ?dataset (str(?titleLit) as ?title) ?date ?license
WHERE {
?dataset a void:Dataset ;
dcterms:title ?titleLit ;
dcterms:license ?license ;
pav:createdOn ?date .
}"
resultsMetadata <- SPARQL(endpointwp,queryMetadata,curl_args=list(useragent=R.version.string))
showresultsMetadata <- resultsMetadata$results
remove(queryMetadata, resultsMetadata)
## 2. Query Pathway data:
##Pathway Models IDs of interest:
pathways_of_Interest <- paste('"WP4224"', '"WP4225"', '"WP4571"', '"WP4595"', '"WP4583"', '"WP4584"')
queryAllContent_1 <-
"
PREFIX wp: <http://vocabularies.wikipathways.org/wp#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>
#Variable selection
SELECT DISTINCT (str(?title) as ?pathwayName) ?PWID
(count(distinct ?geneProduct) AS ?GenesInPWs)
(count(distinct ?protein) AS ?ProteinsInPWs)
(count(distinct ?metaboliteNode) AS ?MetabolitesInPWs)
(count(distinct ?interactionID) AS ?RheaInPWs)
(count(distinct ?interactionMissing) AS ?NoRheaInPWs)
(count(distinct ?omim) as ?diseaseIDs)
WHERE {
VALUES ?PWID {
"
queryAllContent_2 <-
"
}
?pathway dcterms:identifier ?PWID. #Obtain the ID
#?pathway wp:ontologyTag cur:IEM .
?pathway wp:isAbout ?gpmlRDF_ID . #find the corresponding GPML link
?pathway dc:title ?title . #Obtain the title
{
?geneProduct dcterms:isPartOf ?pathway . #Only those part of PW
?geneProduct a wp:GeneProduct . #Filter for Protein DataNodes
}
UNION
{
?protein dcterms:isPartOf ?pathway . #Only those part of PW
?protein a wp:Protein . #Filter for Protein DataNodes
}
UNION
{
?metaboliteNode a wp:Metabolite . #Filter for Metabolite DataNodes
?metaboliteNode dcterms:isPartOf ?pathway . #Only those part of PW
}
UNION
{
OPTIONAL{?interaction wp:bdbRhea ?interactionID . #Find interactions with Rhea
?interaction dcterms:isPartOf ?pathway .} #Only those part of PW
}
UNION
{
OPTIONAL{?interactionMissing dcterms:isPartOf ?pathway . #Additional interactions
?interactionMissing rdf:type wp:Conversion . #Type 'metabolic conversion'
FILTER NOT EXISTS {?interactionMissing wp:bdbRhea ?interactionID . }#No Rhea
}
}
UNION {
?diseaseNode dcterms:isPartOf ?gpmlRDF_ID . #Only check for matching pathways
?diseaseNode rdf:type gpml:Label . #Only include textLabels
?diseaseNode gpml:href ?omim . #That have an href attribute
FILTER regex(?omim, \"omim.org\", \"i\") #Only keep hrefs if they contain 'omim.org'
}
} ORDER BY ASC(?pathway)
"
queryAllContent <- paste(queryAllContent_1,pathways_of_Interest,queryAllContent_2)
resultsAllContent <- SPARQL(endpointwp,queryAllContent,curl_args=list(useragent=R.version.string))
showresultsAllContent <- resultsAllContent$results
remove(queryAllContent, resultsAllContent)
#Print results Table
print(showresultsMetadata)
print(showresultsAllContent)
remove(pathways_of_Interest, queryAllContent_1, queryAllContent_2)
2.4 Selection of Relevant Biomarkers
First, the Patient data is connected to the reference data (including
identifier mapping). Second, the annotated data is checked for missing
values (NAs) and unification to ChEBI.
## First, match the names of all biomarkers to an identifier (from reference value data, these mappings have been checked manually)
library(dplyr) #needed for left_join() operation
#Note: the matching below is case sensitive (left_join from dplyr doesn't have an option for case insensitive matching)
annotated_PUPY <-
left_join(patientID_PUPY, age_patientID_PUPY, by=c("Dutch.name")) %>%
rowwise()
colnames(annotated_PUPY)[5] <- "low.Ref"
colnames(annotated_PUPY)[6] <- "high.Ref"
annotated_urea <-
left_join(patientID_urea, age_patientID_urea, by=c("Dutch.name")) %>%
rowwise()
colnames(annotated_urea)[5] <- "low.Ref"
colnames(annotated_urea)[6] <- "high.Ref"
annotated_both <- rbind(annotated_PUPY, annotated_urea) #Combining data for PUPY and urea biomarkers in one dataframe
#Remove rows which are for treatment values (Oxypurinol and allopurinol) or not measured by assays anymore(Argininosuccinic acid anhydride) by name of compound
annotated_both <- annotated_both [(!(annotated_both$Dutch.name=="oxypurinol") & !(annotated_both$Dutch.name=="allopurinol") & !(annotated_both$Dutch.name=="ASA_anhydride")),]
MissingMappings <- list()
counter = 1
for (i in 1:nrow(annotated_both)){
if(annotated_both[i,4] != ""){next}
else{
MissingMappings[counter] <- (annotated_both[i,1])
counter <- counter + 1
}
}
remove(i,counter)
CHEBIMissing <- list()
counter = 1
for (i in 1:nrow(annotated_both)){
if(!grepl('CHEBI', annotated_both[i,4]) && annotated_both[i,4] != ""){
CHEBIMissing[counter] <- (annotated_both[i,1])
counter <- counter + 1
}
else{next}
}
remove(i,counter)
remove(patientID_PUPY,patientID_urea,annotated_PUPY,annotated_urea,age_patientID_PUPY,age_patientID_urea)
print(paste("These biomarkers do not have an identfier mapping:" , paste(c(MissingMappings), collapse=', ' )))
[1] "These biomarkers do not have an identfier mapping: cyshcys"
print(paste("These biomarkers are not annotated with a ChEBI ID:" , paste(c(CHEBIMissing), collapse=', ' )))
[1] "These biomarkers are not annotated with a ChEBI ID: 2,8-dihydroxyadenine"
Second, query which biomarkers measured in the assays are not in any
pathway model:
allBiomarkers <- annotated_both$ID ##Take the ID column for all data
allBiomarkers <- allBiomarkers[grepl("^CHEBI:", allBiomarkers)] #remove Is not starting with 'CHEBI:' for consistency
library(stringr)
cleaned_CHEBI_all <- str_replace(allBiomarkers, "CHEBI:", "ch:") #update IDs to work with SPARQL query
#supercleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "") #update IDs to work with SPARQL query
string_CHEBI_all <- paste(c(cleaned_CHEBI_all), collapse=' ' )
remove (cleaned_CHEBI_all)
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
SELECT DISTINCT ?chebiMetabolite WHERE {
VALUES ?chebiMetabolite {"
item2 = "}
?pathwayRes a wp:Pathway ;
wp:organismName \'Homo sapiens\' .
?metabolite a wp:Metabolite ;
dcterms:identifier ?id ;
dcterms:isPartOf ?pathwayRes .
?metabolite wp:bdbChEBI ?chebiMetabolite.
}"
queryMissingBiomarkers_all <- paste(item1,string_CHEBI_all,item2)
remove(item1,item2)
resultsMissingBiomarkers_all <- SPARQL(endpointwp,queryMissingBiomarkers_all,curl_args=list(useragent=R.version.string))
listMissingBiomarkers_all <- c(resultsMissingBiomarkers_all$results) #safe results as list for comparison.
remove(queryMissingBiomarkers_all,resultsMissingBiomarkers_all)
CHEBI_inPWs_all <- gsub("[<https://identifiers.org/chebi/CHEBI:>]", "", listMissingBiomarkers_all) #ChEBI IDs Results IRI cleanup
CHEBI_inQuery_all <- gsub("CHEBI:", "", allBiomarkers) #ChEBI IDs Query cleanup
intersectingCHEBI_all <- setdiff(CHEBI_inQuery_all,CHEBI_inPWs_all)
string_intersectingCHEBI_all <- paste(c(intersectingCHEBI_all), collapse=', ' )
#count_biomarkers <- sapply(strsplit(string_CHEBI, " "), length)
#Find names for missing Biomarkers based on ChEBI ID (to help with data curation)
CHEBI_intersectingCHEBI_all <- paste0("CHEBI:", intersectingCHEBI_all) #update IDs to match back to data again
missingNames_all <- list()
for (j in 1:length(CHEBI_intersectingCHEBI_all)){
for (i in 1:nrow(annotated_both)){
if(annotated_both[i,4] == CHEBI_intersectingCHEBI_all[j]){
missingNames_all[j] <- annotated_both[i,3]
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_all <- do.call(paste, c(as.list(missingNames_all), sep = ", "))
#Print relevant information:
if(length(intersectingCHEBI_all) == 0 ){print("All biomarkers on the assay(s) are in a pathway!")} else{
print(paste0("These biomarkers (as ChEBI IDs) are not in any pathway: " , string_intersectingCHEBI_all, "; with the following Database names: ", string_missingNames_all))}
[1] "These biomarkers (as ChEBI IDs) are not in any pathway: 35621, 70744, 89698, 27596, 50599, 86498, 17261, 49015, 61511; with the following Database names: AABA, Gly-pro, Homocystine, 3-Methyl-histidine, 1-Methyl-histidine, Hydroxylysine, N-Aspartylglucosamine, Piperideine carboxylic acid, a-Aminoadipate semialdehyde"
remove(allBiomarkers, CHEBI_inPWs_all, CHEBI_inQuery_all, CHEBI_intersectingCHEBI_all, intersectingCHEBI_all, string_CHEBI_all, string_intersectingCHEBI_all, string_missingNames_all, missingNames_all, CHEBIMissing)
Third, comparing patient data to reference data to calculate the
log(2) change value and data cleanup.
# First, convert the Patient Data column to a type "double" iso "character" (empty values will become "NA")
names(annotated_both)[names(annotated_both) == patientID] <- "i.patientData"
annotated_both$i.patientData <- as.integer(annotated_both$i.patientData)
# Second, compare the Patient Data to the reference Values (if available), and calculate an additional column called "Change".
annotated_both <- annotated_both%>%mutate(Change = case_when(
i.patientData == 0 ~ NA_real_, # Patient data is equal to zero
high.Ref == 0 ~ NA_real_, # If highRef is equal to zero (to avoid dividing by zero)
is.na(i.patientData) ~ NA_real_, # No patient data available
is.na(high.Ref) & is.na(low.Ref) & i.patientData != 0 ~ NA_real_, # If no highRef and lowRef value is available (NA), and there is PatientData, change is NA (unable to determine)
i.patientData >= low.Ref & i.patientData <= high.Ref ~ 1, #Patient data between reference values
i.patientData < low.Ref ~ i.patientData/low.Ref, #Patient data lower than lowRef value (downregulated)
i.patientData > high.Ref ~ i.patientData/high.Ref #Patient data higher than highRef value (upregulated)
))
# Third, remove all rows in the dataframe where the Change is NA .
annotated_both <- annotated_both[complete.cases(annotated_both$Change),]
# Add patient ID to column again:
names(annotated_both)[names(annotated_both) == "i.patientData"] <- patientID #Rename column to actual Patient ID for visualization
# Convert "Change column to log(2) scale
annotated_both$log.Change <- log(annotated_both$Change, 2)
#Convert Change column from scientific notation to 2 digits.
is.num <- sapply(annotated_both, is.numeric)
annotated_both[is.num] <- lapply(annotated_both[is.num], round, 2)
remove(is.num)
#Copy data for further cleanup (sort based on highest log-change value, no zero values etc.):
annotated_both_nozero <-annotated_both
#Data cleanup
annotated_both_nozero <- annotated_both_nozero [(!(annotated_both_nozero$log.Change==0)),] #Remove rows with zero's :
annotated_both_nozero <- annotated_both_nozero[order(-annotated_both_nozero$log.Change),] #Sort on log2(change)
#Print final dataset of patient before visualisation (only relevant columns):
print(annotated_both_nozero[,c(1,3,4,9)], digits = 2)
NA
Fourth, find which relevant biomarkers are in a pathway model.
##Add all IDs in a list
CHEBI_IDs <- list()
counter = 1
for (i in 1:nrow(annotated_both_nozero)){
if(grepl('CHEBI', annotated_both_nozero[i,4]) && annotated_both_nozero[i,4] != "" && annotated_both_nozero[i,9] != 0){
CHEBI_IDs[counter] <- (annotated_both_nozero[i,4])
counter <- counter + 1
}
else{next}
}
remove(i,counter)
vector_CHEBI <- unlist(CHEBI_IDs) #convert list to array, for traversing the data to a SPARQL query later on
library(stringr)
cleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "ch:") #update IDs to work with SPARQL query
supercleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "") #update IDs to work with SPARQL query
string_CHEBI <- paste(c(cleaned_CHEBI), collapse=' ' )
remove (CHEBI_IDs, vector_CHEBI, cleaned_CHEBI)
##Connect to Endpoint WikiPathways (if step 2.3 is skipped)
endpointwp <- "https://sparql.wikipathways.org/sparql"
##For now, split up the query in sections, to add the string_CHEBI as VALUES.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs)
where {
VALUES ?chebiMetabolite {"
item2 = " }
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName \'Homo sapiens\' ;
dcterms:identifier ?wpid ;
dc:title ?title .
}
ORDER BY DESC(?CHEBIsInPWs)"
query_RelevantPWs <- paste(item1,string_CHEBI,item2)
remove(item1,item2)
results_RelevantPWs <- SPARQL(endpointwp,query_RelevantPWs,curl_args=list(useragent=R.version.string))
showresults_RelevantPWs <- results_RelevantPWs$results
remove(query_RelevantPWs,results_RelevantPWs)
print(paste0("There are ", nrow(annotated_both_nozero) ," biomarkers relevant for patient " , patientID, ", with ChEBI-IDs: ", string_CHEBI, "."))
[1] "There are 7 biomarkers relevant for patient I, with ChEBI-IDs: ch:58443 ch:17821 ch:17261 ch:17568 ch:16964 ch:70744 ch:57731."
Fifth, find which patient specific biomarkers are not in any
pathway.
##Find Missing Biomarkers (not part of any Human pathway model)
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
SELECT DISTINCT ?chebiMetabolite WHERE {
VALUES ?chebiMetabolite {"
item2 = "}
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' .
?metabolite a wp:Metabolite ;
dcterms:identifier ?id ;
dcterms:isPartOf ?pathwayRes .
?metabolite wp:bdbChEBI ?chebiMetabolite.
}"
queryMissingBiomarkers <- paste(item1,string_CHEBI,item2)
remove(item1,item2)
resultsMissingBiomarkers <- SPARQL(endpointwp,queryMissingBiomarkers,curl_args=list(useragent=R.version.string))
listMissingBiomarkers <- c(resultsMissingBiomarkers$results) #safe results as list for comparison.
remove(queryMissingBiomarkers,resultsMissingBiomarkers)
CHEBI_inPWs <- gsub("[<https://identifiers.org/chebi/CHEBI:>]", "", listMissingBiomarkers) #ChEBI IDs IRI cleanup
intersectingCHEBI <- setdiff(supercleaned_CHEBI,CHEBI_inPWs)
string_intersectingCHEBI <- paste(c(intersectingCHEBI), collapse=', ' )
count_biomarkers <- sapply(strsplit(string_CHEBI, " "), length)
#Find names for missing Biomarkers based on ChEBi ID (to help with data curation)
CHEBI_intersectingCHEBI <- paste0("CHEBI:", intersectingCHEBI) #update IDs to match back to data again
missingNames <- list()
for (j in 1:length(CHEBI_intersectingCHEBI)){
for (i in 1:nrow(annotated_both_nozero)){
if(annotated_both_nozero[i,4] == CHEBI_intersectingCHEBI[j]){
missingNames[j] <- annotated_both_nozero[i,3]
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames <- do.call(paste, c(as.list(missingNames), sep = ", "))
#Print relevant information:
if(length(intersectingCHEBI) == 0 ){print("All relevant biomarkers are in a pathway!")} else{
print(paste0("These biomarkers (as ChEBI IDs) are not in a pathway: " , string_intersectingCHEBI, "; with the following Database names: ", string_missingNames))}
[1] "These biomarkers (as ChEBI IDs) are not in a pathway: 17261, 70744; with the following Database names: N-Aspartylglucosamine, Gly-pro"
2.5 Theoretical Biomarker Data
Read in information from IEMBase (collected manually) and select only
data relevant to sample matrix (urine) and age category (from
patient)
biomarkers_purine_all <- read.table(file = 'Data/IEM_database_info_Purine.txt', sep = '\t', header = TRUE)
biomarkers_pyrimidine_all <- read.table(file = 'Data/IEM_database_info_Pyrimidine.txt', sep = '\t', header = TRUE)
biomarkers_urea_all <- read.table(file = 'Data/IEM_database_info_Urea.txt', sep = '\t', header = TRUE)
#Summarize Purine disorders
list_diseases_purine <- unique(na.omit(biomarkers_purine_all[c(17)])) #list all unique disease protein names, #Remove NA values
#Summarize Pyrimidine disorders
list_diseases_pyrimidine <- unique(na.omit(biomarkers_pyrimidine_all[c(17)])) #list all unique disease protein names #Remove NA values
#Summarize Urea Cycle disorders
list_diseases_urea <- unique(na.omit(biomarkers_urea_all[c(17)])) #list all unique disease protein names #Remove NA values
total_list_diseases <- c(list_diseases_purine[[1]], list_diseases_pyrimidine[[1]], list_diseases_urea[[1]])
## Print amount of unique diseases for each category:
paste0("In total, there were ", length(list_diseases_purine[[1]]) ," unique purine, ", length(list_diseases_pyrimidine[[1]]), " pyrimidine and ", length(list_diseases_urea[[1]]), " urea cycle IMDs (total of ", length(total_list_diseases) ," disorders).")
[1] "In total, there were 17 unique purine, 10 pyrimidine and 8 urea cycle IMDs (total of 35 disorders)."
##PUPY: (age in years)
if(age >= 0 && age < 1){
ref_age_markers = c(10) #age_0to1m, neonatal
}else if(age >= 1 && age < 18){
ref_age_markers = c(11) #age_1mto18m, infancy
}else if(age >= 18 && age < 132){
ref_age_markers = c(12) #age_18mto11y, childhood
}else if(age >= 131 && age < 192){
ref_age_markers = c(13) #age_11to16y, adolescence
}else{
ref_age_markers = c(14) #age_16<, adulthood
}
biomarkers_purine_age <- as.data.frame(biomarkers_purine_all[, c(1,17,5:7,ref_age_markers,16)], drop=false)
biomarkers_purine_age$Category <- 1
biomarkers_pyrimidine_age <- as.data.frame(biomarkers_pyrimidine_all[, c(1,17,5:7,ref_age_markers,16)], drop=false)
biomarkers_pyrimidine_age$Category <- 2
biomarkers_urea_age <- as.data.frame(biomarkers_urea_all[, c(1,17,5:7,ref_age_markers,16)], drop=false)
biomarkers_urea_age$Category <- 3
biomarkers_three <- rbind(biomarkers_purine_age, biomarkers_pyrimidine_age, biomarkers_urea_age) #Combining data theoretical biomarkers in one dataframe
names(biomarkers_three)[6] <- "UpDown"
remove(biomarkers_purine_all, biomarkers_purine_age, biomarkers_pyrimidine_all, biomarkers_pyrimidine_age, biomarkers_urea_all, biomarkers_urea_age)
# Third, clean up the data, by selecting only entries with a HMDB ID, measured in urine, and converting non-numerical entries for up/down regulation.
biomarkers_three <- biomarkers_three [(!(is.na(biomarkers_three$HMDB))),]
biomarkers_three <- biomarkers_three [(!(is.na(biomarkers_three$Measured.in))) & ((biomarkers_three$Measured.in=="U") | (biomarkers_three$Measured.in=="Urine")), ]
#Replace non-numerical characters with values
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-2 to -1"] <- ("-1.5"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-2 to n"] <- ("-1"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-1 to n"] <- ("-0.5"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n to 2"] <- ("1"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n to 1"] <- ("0.5"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n"] <- ("0"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "+-"] <- ("0"))
biomarkers_three$UpDown <- as.numeric(biomarkers_three$UpDown)
##Convert old HMD's to new IDs:
for (i in 1:length((biomarkers_three$HMDB))) {
if(nchar(biomarkers_three$HMDB[i])<= 9){
biomarkers_three$HMDB[i] <- str_replace(biomarkers_three$HMDB[i], "HMDB", "HMDB00")
}
else{next}
}
remove(i)
## Print amount of unique diseases and biomarkers:
paste0("For these three classes of IMDs, ", nrow(unique(na.omit(biomarkers_three[c(5)]))) ," unique biomarkers were relevant for the sample matrix urine, representing ", nrow(unique(na.omit(biomarkers_three[c(2)]))) , " unique disorders; " , sum(is.na(biomarkers_three[c(2)])), " disorder is missing a protein name.")
[1] "For these three classes of IMDs, 26 unique biomarkers were relevant for the sample matrix urine, representing 26 unique disorders; 1 disorder is missing a protein name."
##Dynamic caption info:
df_figure_caption_biomarkers <- biomarkers_three[,c(3,5)] # Combines biomarkers_three$Symptoms with biomarkers_three$HMDB
df_figure_caption_biomarkers <- unique(df_figure_caption_biomarkers) #Only retain unique metabolite names and HMDB IDs.
df_figure_caption_biomarkers <- na.omit(df_figure_caption_biomarkers) #remove any rows with NA-values
df_figure_caption_disorders <- biomarkers_three[,c(2,1)] # Combines biomarkers_three$Protein_HGNC_name with biomarkers_three$Disorder
df_figure_caption_disorders <- unique(df_figure_caption_disorders) #Only retain unique disorder names + HGNC names
df_figure_caption_disorders <- na.omit(df_figure_caption_disorders) #remove any rows with NA-values
relevant <- biomarkers_three[,c(2,5,6,8)]
##Remove rows with NA values anywhere
relevant <- relevant[complete.cases(relevant), ]
##Remove rows with zero values anywhere
relevant <- relevant[apply(relevant!=0, 1, all),]
#Keep unique diseases and category names
diseases_categories <- relevant[,c(1,4)]
unique_disease_categories <- aggregate(diseases_categories[-1], list(diseases_categories$Protein_HGNC_name), FUN = mean, na.rm = TRUE) #Sorted alphabetically
# Load required packages for cast() function (data transformation based on certain column input.)
if (!require("reshape2")) {
install.packages("reshape2", dependencies = TRUE)
library(reshape2)
}
Loading required package: reshape2
if (!require("reshape")) {
install.packages("reshaper", dependencies = TRUE)
library(reshape)
}
Loading required package: reshape
Attaching package: ‘reshape’
The following objects are masked from ‘package:reshape2’:
colsplit, melt, recast
The following object is masked from ‘package:dplyr’:
rename
# Transform the data, using protein HGNC names as row names, HMDB-IDs as columns, and UpDown-regulation values to create matrix
protein.cast <- cast(relevant, Protein_HGNC_name~HMDB, value="UpDown", sum)
column_start = (ncol(protein.cast)+1)
for (i in 1:nrow(protein.cast)) {
if(protein.cast[i,1] == unique_disease_categories[i,1]){
protein.cast[i,column_start] <- unique_disease_categories[i,2]
}
}
remove(i)
names(protein.cast)[column_start] <- "Category"
protein.cast_ordered <- protein.cast[order(protein.cast$Category),]
lines_Category_Purine <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "1",])
lines_Category_Pyrimidine <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "2",])
lines_Category_Urea <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "3",])
##Remove categories column again, so it will not disturb the correlation calculation
protein.cast_ordered <- subset(protein.cast_ordered, select=-c(Category))
rnames <- protein.cast_ordered[,1] #Store the protein names to use as row values in matrix later
matrix_protein.cast <- data.matrix(protein.cast_ordered) #convert the dataframe to a matrix (only including numeric values)
rownames(matrix_protein.cast) <- rnames #Add the saved protein names as rowLabels
matrix_protein.cast <- matrix_protein.cast[,-1] #Remove the first column, including count of rows from dataframe. #Sorted alphabetically
remove(biomarkers_three, diseases_categories, protein.cast, protein.cast_ordered, relevant, unique_disease_categories)
not_covered_IMDs <- setdiff(total_list_diseases,rnames)
not_covered_IMDs <- paste(not_covered_IMDs, collapse = ', ')
amount_biomarkers_matrixData <- ncol(matrix_protein.cast)
amount_diseases_matrixData <- nrow(matrix_protein.cast)
##Add disorders not covered with zero's for all columns:
#MatrixB <- rbind(MatrixA, c(10,11,12))
## Print amount of unique diseases for each category:
paste0("Taking available reference data into account and availability of a HMDB ID, a maximum of ", amount_biomarkers_matrixData ," unique biomarkers could be linked to ", amount_diseases_matrixData, " IMDs.")
[1] "Taking available reference data into account and availability of a HMDB ID, a maximum of 23 unique biomarkers could be linked to 25 IMDs."
paste0("For the following IMDs, no data was available: ", not_covered_IMDs, ".")
[1] "For the following IMDs, no data was available: TPMT, PRPPS, RRM2B, ITPA, IMPDH1, AMPD1, TYMP, TK2, DHODH."
if (!require("gplots")) {
install.packages("gplots", dependencies = TRUE)
library(gplots)
}
Loading required package: gplots
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
lowess
if (!require("RColorBrewer")) {
install.packages("RColorBrewer", dependencies = TRUE)
library(RColorBrewer)
}
Loading required package: RColorBrewer
my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 299)
col_breaks = c(seq(-3,-1,length=100), # for blue
seq(-0.99,0.99,length=100), # for yellow
seq(1,3,length=100)) # for red
#Create a file name specific for patientID
filename <- paste0("Images/", age, "months_heatmap.png")
title_name <- paste0("Biomarker overlap for three IMD types, \n age category: ", agerange)
# creates a 5 x 5 inch image
png(filename, # create PNG for the heat map
width = 5*300, # 5 x 300 pixels
height = 5*300,
res = 280, # 300 pixels per inch
pointsize = 8) # smaller font size
output <- heatmap.2(matrix_protein.cast,
#cellnote = matrix_protein.cast, # same data set for cell labels
main = title_name, # heat map title
notecol="black", # change font color of cell labels to black
cellnote = ifelse(matrix_protein.cast != 0, matrix_protein.cast, ""), #Visualizing only label if data is not zero
RowSideColors = c( # grouping row-variables into different categories, e.g.
rep("gray", lines_Category_Purine), # Purine diseases
rep("blue", lines_Category_Pyrimidine), # Pyrimidine diseases
rep("black", lines_Category_Urea)), # Urea diseases
density.info="none", # turns off density plot inside color legend
trace="none", # turns off trace lines inside the heat map
margins =c(12,9), # widens margins around plot
col=my_palette, # use on color palette defined earlier
breaks=col_breaks, # enable color transition at specified limits
dendrogram="row", # only draw a row dendrogram
Colv="NA") # turn off column clustering
par(lend = 1) # square line ends for the color legend
legend("topright", # location of the legend on the heatmap plot
legend = c("Purine", "Pyrimidine", "Urea cycle"), # category labels
col = c("gray", "blue", "black"), # color key
title = "IMD classes", # legend title
lty= 1, # line style
lwd = 10 # line width
)
dev.off() # close the PNG device
null device
1
remove(rnames, title_name)
#Show the heatmap as output in RMD file
knitr::include_graphics(filename)
##Obtain the order of the diseases and biomarkers as appearing in the clustered heatmap:
verticalAxis <- rev(rownames(matrix_protein.cast)[output$rowInd])
horizontalAxis <- colnames(matrix_protein.cast)[output$colInd]
##New dynamic Figure captions:
###Diseases
disorderNames <- list()
for (j in 1:length(verticalAxis)){
for (i in 1:nrow(df_figure_caption_disorders)){
if(df_figure_caption_disorders[i,1] == verticalAxis[j]){
combineDiseaseHGNC <- paste0(verticalAxis[j], ': ', df_figure_caption_disorders[i,2])
disorderNames[j] <- combineDiseaseHGNC
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_disorderNames <- do.call(paste, c(as.list(disorderNames), sep = ", "))
###Biomarkers
biomarkerNames <- list()
for (j in 1:length(horizontalAxis)){
for (i in 1:nrow(df_figure_caption_biomarkers)){
if(df_figure_caption_biomarkers[i,2] == horizontalAxis[j]){
combinebiomarkerHMDB <- paste0(horizontalAxis[j], ': ', df_figure_caption_biomarkers[i,1])
biomarkerNames[j] <- combinebiomarkerHMDB
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames <- do.call(paste, c(as.list(biomarkerNames), sep = ", "))
#df_figure_caption_biomarkers
#Print Figure caption (sorted diseases and unsorted metabolites)
paste("Protein names correspond to the following disorders:", ordered_disorderNames)
[1] "Protein names correspond to the following disorders: SLC25A15: Ornithine transporter deficiency, OTC: Ornithine transcarbamylase deficiency, ADA: Adenosine deaminase deficiency, SLC25A13: Citrin deficiency, ARG1: Arginase deficiency, DGUOK: Deoxyguanosine kinase deficiency, NAGS: N-Acetylglutamate synthase deficiency, CPS1: Carbamoyl phosphate synthetase I deficiency, NT5C3A: Pyrimidine 5’-nucleotidase superactivity, AGXT2: Beta-aminoisobutyrate-pyruvate transaminase deficiency, APRT: Adenine phosphoribosyltransferase deficiency, ADSL: Adenyl- succinate lyase deficiency, ATIC: AICAr transformylase/IMP cyclohydrolase deficiency, ASL: Argininosuccinic aciduria, UMPS: Orotic aciduria type I, ASS1: Citrullinemia type I, XAN2: Xanthinuria, Type II, XO: Xanthinuria, Type I, PNP: Purine nucleoside phosphorylase deficiency, DPYD: Dihydropyrimidine dehydrogenase deficiency, HPRT1_less: Kelley-Seegmiller syndrome, HPRT1: Lesch-Nyhan syndrome, PRPS1: Phosphoribosyl pyrophosphate synthetase 1 superactivity, DPYS: Dihydropyrimidinase deficiency, UPB1: Beta-ureidopropionase deficiency"
paste(". HMDB IDs resemble these metabolites:", ordered_biomarkerNames)
[1] ". HMDB IDs resemble these metabolites: HMDB0000026: N-Carbamyl-beta-alanine, HMDB0000034: Adenine, HMDB0000052: Argininosuccinate, HMDB0000071: Deoxyinosine, dIno, HMDB0000076: Dihydrouracil, HMDB0000079: Dihydrothymine, HMDB0000085: Purine nucleoside phosphorylase, HMDB0000101: Deoxyadenosine, HMDB0000143: Galactose, HMDB0000157: Hypoxanthine, HMDB0000226: Orotic acid, HMDB0000262: Thymine, HMDB0000289: Uric acid, HMDB0000292: Xanthine, HMDB0000300: Uracil, HMDB0000401: 2,8-Dihydroxyadenine, HMDB0000635: Succinylacetone, HMDB0000679: Homocitrulline, HMDB0000797: SAICA riboside, HMDB0000904: Citrulline, HMDB0000912: Succinyladenosine, HMDB0002299: beta-aminoisobutyrate, HMDB0062179: AICA riboside"
remove(total_list_diseases, amount_biomarkers_matrixData, amount_diseases_matrixData, column_start, ordered_biomarkerNames, ordered_disorderNames, list_diseases_purine, list_diseases_pyrimidine, list_diseases_urea, not_covered_IMDs)
2.6 Relevant Biomarker Overlap
## Obtain patient data
patient_biomarker_data_HMDB <- annotated_both_nozero[,c(4,7,9)]
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[!(is.na(patient_biomarker_data_HMDB$HMDB) | patient_biomarker_data_HMDB$HMDB==""), ]
#replace data smaller than 0.05 or -0.05 with NA
patient_biomarker_data_HMDB$log.Change[findInterval(patient_biomarker_data_HMDB$log.Change, c(-0.05,0.05)) == 1L] <- NA
# Remove all rows in the dataframe where the Change is NA .
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[complete.cases(patient_biomarker_data_HMDB$log.Change),]
#Dynamic caption for patient specific biomarkers:
df_figure_caption_biomarkers_patientspecific <- annotated_both_nozero[,c(3,7)] # Combines biomarkers_three$Symptoms with biomarkers_three$HMDB
df_figure_caption_biomarkers_patientspecific <- unique(df_figure_caption_biomarkers_patientspecific) #Only retain unique metabolite names and HMDB IDs.
df_figure_caption_biomarkers_patientspecific <- na.omit(df_figure_caption_biomarkers_patientspecific) #remove any rows with NA-values
#Convert log(2)change data to similar numeric scale as theoretical biomarker data (-3 to 3)
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB%>%mutate(biomarker.Change = case_when(
log.Change > 3 ~ 3, # Every log(2) change value larger then 3
log.Change < -3 ~ -3, # Every log(2) change value smaller then -3
TRUE ~ log.Change # Keep all other values the same
))
#Remove columns which are not needed
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[,c(-1,-3)]
#Transpose data
patient_extension = (t(patient_biomarker_data_HMDB[,2]))
colnames(patient_extension) <- patient_biomarker_data_HMDB$HMDB
#Add label "patient' in front of ID, for clearer depiction in heatmap
if(patientID_treatment[,2] == 1){
patientLabel <- paste0("Patient_", patientID, ", treated")
}else{patientLabel <- paste0("Patient_", patientID, ", untreated")}
rownames(patient_extension) <- patientLabel
#Add patient data to matrix
matrix_protein.cast_patient <- acast(rbind(melt(matrix_protein.cast), melt(patient_extension)), X1~X2, sum)
Warning: 'as.is' should be specified by the caller; using TRUEWarning: 'as.is' should be specified by the caller; using TRUEWarning: 'as.is' should be specified by the caller; using TRUEWarning: 'as.is' should be specified by the caller; using TRUE
matrix_protein.cast_patient <- matrix_protein.cast_patient[apply(matrix_protein.cast_patient[,-1], 1, function(x) !all(x==0)),] #remove a row if all values are zero.
matrix_protein.cast_patient <- matrix_protein.cast_patient[, colSums(matrix_protein.cast_patient != 0) > 0] #remove a column if all values are zero.
#Create a file name specific for patientID
filename2 <- paste0("Images/", patientID, "_heatmap.png")
title_name2 <- paste0("Biomarker overlap for three IMD types, \n age category: ", agerange, "\n for patient:", patientID)
# creates a 5 x 5 inch image
png(filename2, # create PNG for the heat map
width = 5*400+nrow(patient_biomarker_data_HMDB)*30, # Updated based on high number of biomarkers for patient.
height = 5*300,
res = 280, # 300 pixels per inch
pointsize = 8) # smaller font size
output_patient <- heatmap.2(matrix_protein.cast_patient,
#cellnote = round(matrix_protein.cast_patient,0), # same data set for cell labels, round off to 1 decimal.
main = title_name2, # heat map title
notecol="black", # change font color of cell labels to black
cellnote = ifelse(matrix_protein.cast_patient != 0, round(matrix_protein.cast_patient,1), ""), #Visualizing only label if data is not zero
#Color the space between the data points
sepwidth=c(0.01,0.01),
sepcolor="lightgray",
colsep=1:ncol(matrix_protein.cast_patient),
rowsep=1:nrow(matrix_protein.cast_patient),
RowSideColors = c( # grouping row-variables into different categories, e.g.
rep("grey", lines_Category_Purine), # Purine diseases
rep("blue", lines_Category_Pyrimidine), # Pyrimidine diseases
rep("black", lines_Category_Urea), # Urea diseases
rep("purple", 1)) , #patientData
density.info="none", # turns off density plot inside color legend
trace="none", # turns off trace lines inside the heat map
margins =c(12,9), # widens margins around plot
col=my_palette, # use on color palette defined earlier
breaks=col_breaks, # enable color transition at specified limits
dendrogram="row", # only draw a row dendrogram
Colv="NA") #, # turn off column clustering
#na.color = "Grey") # color NA values specifically --> dOESN'T WORK :(
par(lend = 1) # square line ends for the color legend
legend("topright", # location of the legend on the heatmap plot
legend = c("Purine", "Pyrimidine", "Urea cycle", "Patient"), # category labels
col = c("gray", "blue", "black", "purple"), # color key
title = "IMD classes", # legend title
lty= 1, # line style
lwd = 10 # line width
)
dev.off() # close the PNG device
null device
1
#Show the heatmap as output in RMD file
knitr::include_graphics(filename2)
##Print names of metabolites added in heatmap originating from patient.
## Clustering order might differ, so data needs to be reloaded:
##Obtain the order of the diseases and biomarkers as appearing in the clustered heatmap:
verticalAxis_patient <- rev(rownames(matrix_protein.cast_patient)[output_patient$rowInd])
horizontalAxis_patient <- colnames(matrix_protein.cast_patient)[output_patient$colInd]
##New dynamic Figure captions:
###Diseases
disorderNames_patient <- list()
for (j in 1:length(verticalAxis_patient)){
for (i in 1:nrow(df_figure_caption_disorders)){
if(df_figure_caption_disorders[i,1] == verticalAxis_patient[j]){
combineDiseaseHGNC_patient <- paste0(verticalAxis_patient[j], ': ', df_figure_caption_disorders[i,2])
disorderNames_patient[j] <- combineDiseaseHGNC_patient
}
else{
#noDiseaseID <- paste(verticalAxis_patient[(j-1)])#print message which ID cannot be found
next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_disorderNames_patient <- do.call(paste, c(as.list(disorderNames_patient), sep = ", "))
ordered_disorderNames_patient <- str_replace_all(ordered_disorderNames_patient, ' , ', ' ' )
###Theoretical Biomarkers
biomarkerNames_patient <- list()
for (j in 1:length(horizontalAxis_patient)){
for (i in 1:nrow(df_figure_caption_biomarkers)){
if(df_figure_caption_biomarkers[i,2] == horizontalAxis_patient[j]){
combinebiomarkerHMDB_patient <- paste0(horizontalAxis_patient[j], ': ', df_figure_caption_biomarkers[i,1])
biomarkerNames_patient[j] <- combinebiomarkerHMDB_patient
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames_patient <- do.call(paste, c(as.list(biomarkerNames_patient), sep = ", "))
##Patient-specific Biomarkers
compare_patient_to_theoreticalMarkers <- setdiff(horizontalAxis_patient, df_figure_caption_biomarkers[,2])
biomarkerNames_patientspecific <- list()
for (j in 1:length(compare_patient_to_theoreticalMarkers)){
for (i in 1:nrow(df_figure_caption_biomarkers_patientspecific)){
if(df_figure_caption_biomarkers_patientspecific[i,2] == compare_patient_to_theoreticalMarkers[j]){
combinebiomarkerHMDB_patientspecific <- paste0(compare_patient_to_theoreticalMarkers[j], ': ', df_figure_caption_biomarkers_patientspecific[i,1])
biomarkerNames_patientspecific[j] <- combinebiomarkerHMDB_patientspecific
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames_patientspecific <- do.call(paste, c(as.list(biomarkerNames_patientspecific), sep = ", "))
#df_figure_caption_biomarkers
#Print Figure caption (sorted diseases and unsorted metabolites)
paste0("Protein names correspond to the following disorders: ", ordered_disorderNames_patient, ". ")
[1] "Protein names correspond to the following disorders: SLC25A15: Ornithine transporter deficiency, OTC: Ornithine transcarbamylase deficiency, ADA: Adenosine deaminase deficiency, SLC25A13: Citrin deficiency, ARG1: Arginase deficiency, DGUOK: Deoxyguanosine kinase deficiency, NAGS: N-Acetylglutamate synthase deficiency, CPS1: Carbamoyl phosphate synthetase I deficiency, NT5C3A: Pyrimidine 5’-nucleotidase superactivity, AGXT2: Beta-aminoisobutyrate-pyruvate transaminase deficiency, APRT: Adenine phosphoribosyltransferase deficiency, ADSL: Adenyl- succinate lyase deficiency, ATIC: AICAr transformylase/IMP cyclohydrolase deficiency, ASL: Argininosuccinic aciduria, UMPS: Orotic aciduria type I, ASS1: Citrullinemia type I, XAN2: Xanthinuria, Type II, XO: Xanthinuria, Type I, PNP: Purine nucleoside phosphorylase deficiency, DPYD: Dihydropyrimidine dehydrogenase deficiency, HPRT1_less: Kelley-Seegmiller syndrome, HPRT1: Lesch-Nyhan syndrome, PRPS1: Phosphoribosyl pyrophosphate synthetase 1 superactivity, DPYS: Dihydropyrimidinase deficiency, UPB1: Beta-ureidopropionase deficiency. "
paste0("HMDB IDs resemble these metabolites: ", ordered_biomarkerNames_patient, ". ")
[1] "HMDB IDs resemble these metabolites: HMDB0000026: N-Carbamyl-beta-alanine, HMDB0000034: Adenine, HMDB0000052: Argininosuccinate, HMDB0000071: Deoxyinosine, dIno, HMDB0000076: Dihydrouracil, HMDB0000079: Dihydrothymine, HMDB0000085: Purine nucleoside phosphorylase, HMDB0000101: Deoxyadenosine, HMDB0000143: Galactose, HMDB0000157: Hypoxanthine, HMDB0000226: Orotic acid, HMDB0000262: Thymine, HMDB0000289: Uric acid, HMDB0000292: Xanthine, HMDB0000300: Uracil, HMDB0000401: 2,8-Dihydroxyadenine, HMDB0000635: Succinylacetone, HMDB0000679: Homocitrulline, HMDB0000797: SAICA riboside, HMDB0000904: Citrulline, HMDB0000912: Succinyladenosine, HMDB0002299: beta-aminoisobutyrate, HMDB0062179: AICA riboside. "
paste0("Additional biomarkers relevant for this specific patient (not part of IEMbase data): ", ordered_biomarkerNames_patientspecific, ". ")
[1] "Additional biomarkers relevant for this specific patient (not part of IEMbase data): HMDB0000489: N-Aspartylglucosamine, HMDB0000469: 5-(Hydroxymethyl)uracil, HMDB0000721: Gly-pro, HMDB0002166: (S)-Beta-aminoisobutyrate. "
remove(biomarkerNames, biomarkerNames_patient, biomarkerNames_patientspecific, df_figure_caption_biomarkers, df_figure_caption_disorders, df_figure_caption_biomarkers_patientspecific, disorderNames, disorderNames_patient, horizontalAxis, horizontalAxis_patient, verticalAxis, verticalAxis_patient, ordered_biomarkerNames_patient, ordered_biomarkerNames_patientspecific, ordered_disorderNames_patient, compare_patient_to_theoreticalMarkers, combinebiomarkerHMDB, combinebiomarkerHMDB_patient, combinebiomarkerHMDB_patientspecific, combineDiseaseHGNC, combineDiseaseHGNC_patient)
2.7 Pathway Selection
First, find relevant pathway from WikiPathways (sorted on covering
most patient specific biomarkers)
#For now, filter out Reactome PWs due to visualization issues.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (GROUP_CONCAT(DISTINCT fn:substring(?hgnc,37);separator=' ') AS ?Proteins) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"
item2 = "}
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' ;
dcterms:identifier ?wpid ;
dc:title ?title .
?datanode2 wp:bdbHgncSymbol ?hgnc ;
dcterms:isPartOf ?pathwayRes .
#?pathwayRes wp:ontologyTag cur:Reactome_Approved .
?pathwayRes wp:ontologyTag cur:AnalysisCollection .
}
ORDER BY DESC(?CHEBIsInPWs) "
query_CombinePWs <- paste(item1,string_CHEBI,item2)
remove(item1, item2)
results_CombinePWs <- SPARQL(endpointwp,query_CombinePWs,curl_args=list(useragent=R.version.string))
showresults_CombinePWs <- results_CombinePWs$results
remove(query_CombinePWs,results_CombinePWs)
#Print table with first 5 relevant pathways (if less than 5 are found, print only those)
if(nrow(showresults_CombinePWs) < 5){
print(showresults_CombinePWs_Second[1:nrow(showresults_CombinePWs_Second),c(2:4,6,5)])
}else{print(showresults_CombinePWs[1:5,c(2:4,6,5)])}
Select most relevant pathways
## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
first_selected_pathway = 1
Second, find second pathway covering most unique biomarkers:
#First, obtain all the ChEBI IDs of the first ranked pathway, compare these to the list of remaining markers and only retain the marker which weren't covered yet
catch_chebis_firstPW <- showresults_CombinePWs[first_selected_pathway,6]
split_chebis_firstPW <- as.list(strsplit(catch_chebis_firstPW, ' ')[[1]])
split_string_CHEBI <- as.list(strsplit(string_CHEBI, ' ')[[1]])
cleaned_string_CHEBI <- as.list(str_replace(split_string_CHEBI, "ch:", ""))
compare_FirstSecond_CHEBI <- setdiff(cleaned_string_CHEBI, split_chebis_firstPW)
query_compatible_second <- as.list(paste("ch:", compare_FirstSecond_CHEBI, sep=""))
string_CHEBI_second <- paste(query_compatible_second, collapse = ' ')
remove(catch_chebis_firstPW,split_chebis_firstPW,split_string_CHEBI,cleaned_string_CHEBI,compare_FirstSecond_CHEBI,query_compatible_second)
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (count(distinct ?chebiMetaboliteSecond) AS ?CHEBIsInPWsSecond) (GROUP_CONCAT(DISTINCT fn:substring(?hgnc,37);separator=' ') AS ?Proteins) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"
item2 = "}
VALUES ?chebiMetaboliteSecond {"
item3 = "}
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' ;
dcterms:identifier ?wpid ;
dc:title ?title .
?datanode2 wp:bdbHgncSymbol ?hgnc ;
dcterms:isPartOf ?pathwayRes .
#?pathwayRes wp:ontologyTag cur:Reactome_Approved .
?pathwayRes wp:ontologyTag cur:AnalysisCollection .
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetaboliteSecond ;
dcterms:isPartOf ?pathwayRes .
}
ORDER BY DESC(?CHEBIsInPWsSecond) "
query_CombinePWs_Second <- paste(item1,string_CHEBI,item2, string_CHEBI_second, item3)
remove(item1, item2, item3)
results_CombinePWs_Second <- SPARQL(endpointwp,query_CombinePWs_Second,curl_args=list(useragent=R.version.string))
showresults_CombinePWs_Second <- results_CombinePWs_Second$results
remove(query_CombinePWs_Second,results_CombinePWs_Second)
#Print table with second match, first 5 relevant pathways
print("Second best matching pathways are:")
[1] "Second best matching pathways are:"
if(nrow(showresults_CombinePWs_Second) < 5){
print(showresults_CombinePWs_Second[1:nrow(showresults_CombinePWs_Second),c(2,3,4,5,7)])
}else{print(showresults_CombinePWs_Second[1:5,c(2,3,4,5,7)])}
Select 2nd most relevant pathways
## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
second_selected_pathway = 1
Third, find third pathway covering most unique biomarkers:
#First, obtain all the ChEBI IDs of the second highest ranked pathway, compare these to the list of markers remaining and only retain the marker which weren't covered yet
catch_chebis_secondPW <- showresults_CombinePWs_Second[second_selected_pathway,7]
split_chebis_secondPW <- as.list(strsplit(catch_chebis_secondPW, ' ')[[1]])
split_string_CHEBI_second <- as.list(strsplit(string_CHEBI_second, ' ')[[1]])
cleaned_string_CHEBI_second <- as.list(str_replace(split_string_CHEBI_second, "ch:", ""))
compare_SecondThird_CHEBI <- setdiff(cleaned_string_CHEBI_second, split_chebis_secondPW)
query_compatible_third <- as.list(paste("ch:", compare_SecondThird_CHEBI, sep=""))
string_CHEBI_third <- paste(query_compatible_third, collapse = ' ')
remove(catch_chebis_secondPW,split_chebis_secondPW,split_string_CHEBI_second,cleaned_string_CHEBI_second,compare_SecondThird_CHEBI,query_compatible_third)
#Removed protein output, query times out.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (count(distinct ?chebiMetaboliteSecond) AS ?CHEBIsInPWsSecond) (count(distinct ?chebiMetaboliteThird) AS ?CHEBIsInPWsThird) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"
item2 = "}
VALUES ?chebiMetaboliteSecond {"
item3 = "}
VALUES ?chebiMetaboliteThird {"
item4 = "}
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetabolite ;
dcterms:isPartOf ?pathwayRes .
?pathwayRes a wp:Pathway ;
wp:organismName 'Homo sapiens' ;
dcterms:identifier ?wpid ;
dc:title ?title .
#?pathwayRes wp:ontologyTag cur:Reactome_Approved .
?pathwayRes wp:ontologyTag cur:AnalysisCollection .
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetaboliteSecond ;
dcterms:isPartOf ?pathwayRes .
?datanode a wp:Metabolite ;
wp:bdbChEBI ?chebiMetaboliteThird ;
dcterms:isPartOf ?pathwayRes .
}
ORDER BY DESC(?CHEBIsInPWsThird) "
query_CombinePWs_Third <- paste(item1,string_CHEBI,item2, string_CHEBI_second, item3, string_CHEBI_third, item4)
remove(item1, item2, item3, item4)
results_CombinePWs_Third<- SPARQL(endpointwp,query_CombinePWs_Third,curl_args=list(useragent=R.version.string))
showresults_CombinePWs_Third <- results_CombinePWs_Third$results
remove(query_CombinePWs_Third,results_CombinePWs_Third)
#Print table with third match, first 5 relevant pathways (if there are 5)
##TODO: skip lines below, if there is no relevant third best PW. (patient I for example)
print("Third best matching pathways are:")
[1] "Third best matching pathways are:"
if(nrow(showresults_CombinePWs_Third) < 1){
paste("No more pathway models found")
}else if(nrow(showresults_CombinePWs_Third) < 5){
print(showresults_CombinePWs_Third[1:nrow(showresults_CombinePWs_Third),c(2,3,4:7)])
}else{print(showresults_CombinePWs_Third[1:5,c(2,3,4:7)])}
NA
Select 3nd most relevant pathways
if(nrow(showresults_CombinePWs_Third) < 1){
third_selected_pathway = 0
paste("Skipping this step")}else{
third_selected_pathway = 1 ## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
}
Fourth, find which biomarkers are not covered by top three (or
two).
#First, obtain all the ChEBI IDs of the third highest ranked pathway, compare these to the list of remaining markers and only retain the marker which weren't covered yet
if(third_selected_pathway < 1){
paste("Using IDs from step 2 iso step 3, relevant pathway models are:")
split_string_CHEBI_Third <- as.list(strsplit(string_CHEBI_third, ' ')[[1]])
cleaned_string_CHEBI_Third <- as.list(str_replace(split_string_CHEBI_Third, "ch:", ""))
##Using IDs from third step, if no third PWM is selected
string_CHEBI_third_final <- paste(cleaned_string_CHEBI_Third, collapse = ', ')
#Find names for IDs which are present in a pathway model (if two pathways were selected):
CHEBI_missingNames_pathways_third <- glue::glue("CHEBI:{cleaned_string_CHEBI_Third}")
missingNames_pathways_third <- list()
for (j in 1:length(CHEBI_missingNames_pathways_third)){
for (i in 1:nrow(annotated_both_nozero)){
if(annotated_both_nozero[i,4] == CHEBI_missingNames_pathways_third[j]){
missingNames_pathways_third[j] <- annotated_both_nozero[i,3]
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_pathways_third <- do.call(paste, c(as.list(missingNames_pathways_third), sep = ", "))
}else{
catch_chebis_ThirdPW <- showresults_CombinePWs_Third[third_selected_pathway,7]
split_chebis_ThirdPW <- as.list(strsplit(catch_chebis_ThirdPW, ' ')[[1]])
split_string_CHEBI_Third <- as.list(strsplit(string_CHEBI_third, ' ')[[1]])
cleaned_string_CHEBI_Third <- as.list(str_replace(split_string_CHEBI_Third, "ch:", ""))
##Find IDs still missing after selecting top 3 pathway models.
compare_ThirdFourth_CHEBI <- setdiff(cleaned_string_CHEBI_Third, split_chebis_ThirdPW)
query_compatible_fourth <- as.list(compare_ThirdFourth_CHEBI, sep="")
string_CHEBI_fourth <- paste(query_compatible_fourth, collapse = ', ')
remove(catch_chebis_ThirdPW,split_string_CHEBI_Third,compare_ThirdFourth_CHEBI)
#Find names for IDs which are present in a pathway model (if three pathways were selected):
CHEBI_missingNames_pathways <- glue::glue("CHEBI:{query_compatible_fourth}")
missingNames_pathways <- list()
for (j in 1:length(CHEBI_missingNames_pathways)){
for (i in 1:nrow(annotated_both_nozero)){
if(annotated_both_nozero[i,4] == CHEBI_missingNames_pathways[j]){
missingNames_pathways[j] <- annotated_both_nozero[i,3]
}
else{next}
}
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_pathways <- do.call(paste, c(as.list(missingNames_pathways), sep = ", "))
}
#Print results selected pathways
print(paste0(showresults_CombinePWs[first_selected_pathway,3], " (", showresults_CombinePWs[first_selected_pathway,2], ") [", showresults_CombinePWs[first_selected_pathway,4], "]"))
[1] "Amino acid metabolism (WP3925) [6]"
print(paste0(showresults_CombinePWs_Second[second_selected_pathway,3], " (", showresults_CombinePWs_Second[second_selected_pathway,2], ") [", showresults_CombinePWs_Second[second_selected_pathway,4], "]"))
[1] "Biomarkers for pyrimidine metabolism disorders (WP4584) [5]"
#Print results not selected biomarkers
if(third_selected_pathway == 0){
print(paste0("These identifiers are not captured in the top 2 pathways: ", string_CHEBI_third_final, ", with the following database names: ", string_missingNames_pathways_third))
}else if(third_selected_pathway == 1){
print(paste0(showresults_CombinePWs_Third[third_selected_pathway,3], " (", showresults_CombinePWs_Third[third_selected_pathway,2], ") [", showresults_CombinePWs_Third[third_selected_pathway,4], "]"))
print(paste0("These identifiers are not captured in the top 3 pathways: ", string_CHEBI_fourth, ", with the following database names: ", string_missingNames_pathways))
} else{print(paste0(showresults_CombinePWs_Third[third_selected_pathway,3], " (", showresults_CombinePWs_Third[third_selected_pathway,2], ")# [", showresults_CombinePWs_Third[third_selected_pathway,4], "]"))
print(paste0("These identifiers are not captured in the top 3 pathways: ", string_CHEBI_fourth, ", with the following database names: ", string_missingNames_pathways))}
[1] "Biochemical pathways: part I (WP3604) [2]"
[1] "These identifiers are not captured in the top 3 pathways: 18237, 57743, 15727, with the following database names: Glutamic acid, Citrulline, Carnosine"
2.8 Data Visuzalization
Preparation
Installing and loading packages
These packages need to be installed (and may take a while to
run).
Connect to cytoscape
Open the Cytoscape Program first; this step was tested on version:
3.9.1, running on Java 11.0.11.
# Make sure to have Cytoscape started on your computer (>=3.9.1)
cytoscapePing ()
cytoscapeVersionInfo ()
Install WikiPathways apps
For the opening of pathway models and enhanced visualization steps,
the WikiPathways app necessary (version 3.3.10).
if("WikiPathways" %in% commandsHelp("")) print("Success: the WikiPathways app is installed") else print("Warning: WikiPathways app is not installed. Please install the WikiPathways app before proceeding.")
if(!"WikiPathways" %in% commandsHelp("")){
installApp("WikiPathways")
}
Open pathway from WikiPathways and Visualize patient data through
log.Change
Pathway models can be imported from WikiPathways with the
WikiPathways apps (as pathway or network). Enabling the visualization of
biomarkers scattered over several pathways, requires finding the optimal
combination of two (or three) individual pathways, adding the data to
these pathways and merging the results.
Determine the scale for fill color, and convert data to dataframe
#Determine highest up/downregulated value:
highest <- max(annotated_both$log.Change)
lowest <- abs(min(annotated_both$log.Change))
if(highest < lowest){colorRange <- lowest} else{colorRange <- highest}
#Change data to dataframe before loading in Cytoscape:
finalData <- as.data.frame(annotated_both)
The 2log transformed data can now be visualized on the nodes in the
form of a color gradient from blue(downregulated) to white (0) to red
(upregulated), just as the reference data.
#Select highest relevant pathway ID (containing most biomarkers) based on previous step.
wp= commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs[first_selected_pathway,2])) #pick first PW to show data on.
## Linking metabolic data to ChEBI-ID column
loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")
#Set range of data values for visualisation:
data.values = c(-colorRange,-1,0,1,colorRange)
#display.brewer.all(length(data.values), colorblindFriendly=TRUE, type="div") # div,qual,seq,all
node.colors <- c(rev(brewer.pal(length(data.values), "RdBu")))
#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")
#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_First")
network1 <- exportImage(imageURL_Name,'PNG')
#Show the network as output in RMD file
filenameFirst<- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameFirst)
Selecting the second pathway of choice
#Select secondly highest ranked relevant PW (manual for now):
wp = commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs_Second[second_selected_pathway,2]))
loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")
#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")
#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_Second")
network2 <- exportImage(imageURL_Name,'PNG')
#Show the network as output in RMD file
filenameSecond <- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameSecond)
Selecting the third pathway of choice
if(third_selected_pathway < 1){
paste("skip this step, no third pathway selected")}else{
#Select third highest ranked relevant PW (manual for now):
wp= commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs_Third[third_selected_pathway,2]))
loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")
#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")
#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_Third")
network3 <- exportImage(imageURL_Name,'PNG')
#Show the network as output in RMD file
filenameThird <- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameThird)
}
Print missing biomarkers in visualization
#TODO print which biomarkers are not visualized automatically (now manually selected); also add to final reporting step!
missing_IDs <- paste0(annotated_both_nozero[5,4], ", " , annotated_both_nozero[10,4])
print(annotated_both_nozero[c(5,10),c(1,3,4,8)], digits = 2)
2.9 Data Interpretation
Finally: Print reporting details
###Print overview of information for each patient
print(paste("Selected Patient ID is: " , patientID, ", age is between: ", agerange, " old"))
remove(age)
print("Relevant biomarkers are:")
print(annotated_both_nozero[,c(1,3,4,8)], digits = 2)
#Print relevant biomarker information matched to pathways:
print(paste("There are", count_biomarkers ,"biomarkers relevant for patient" , patientID, ", the ChEBI-IDs are", string_CHEBI))
if(length(intersectingCHEBI) == 0 ){print("All Biomarkers are in a pathway!")} else{
print(paste("These biomarkers (as ChEBI IDs) are not in a pathway:" , string_intersectingCHEBI))}
remove(listMissingBiomarkers, CHEBI_inPWs,supercleaned_CHEBI,string_intersectingCHEBI)
#Print table with first five relevant pathways
#print(showresults_CombinePWs[1:5,c(2,3,4,6)])
###Save relevant information for each patient in dataframe (for Tables in publication)
#table6 <- data.frame(matrix(ncol=4,nrow=0, dimnames=list(NULL, c("Primary.ID", "Secondary.ID", "Tertiary.ID", "Not.covered.biomarkers"))))
##Add patient data to dataframe (note: this overwrites existing patient entries, as opposed to using rbind)
#table6[patientID, ] <- c(showresults_CombinePWs[1,2], showresults_CombinePWs_Second[1,2], showresults_CombinePWs_Third[1,2], missing_IDs)
---
title: "Systems Biology framework using metabolic markers and pathways to support the diagnosis of Purine and Pyrimidine Inherited Metabolic Disorders"
output: html_notebook
# Authors
person("Denise", "Slenter", email = "denise.slenter@maastrichtuniversity.nl, GitHub: DeniseSl22",
       role = "aut")
person("Irene", "Hemel", email = "i.hemel@maastrichtuniversity.nl, GitHub: IreneHemel",
       role = "ctb")
---

# 2.1 Workflow Introduction

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.
The online version displays the results for one Patient.
You can visit our main page at [bigcat-um.github.io/IMD-PUPY/](https://bigcat-um.github.io/IMD-PUPY/)

```{r setup, include=FALSE}
if(!"rmarkdown" %in% installed.packages()){
  install.packages("rmarkdown")
}
library(rmarkdown)
if(!"dplyr" %in% installed.packages()){
  install.packages("dplyr")
}
library(dplyr)

##Setting working directory to local github file for Linux (note: you probably need to change the file path, if you run this workflow on a Windows computer):
knitr::opts_chunk$set(echo = TRUE)
#setwd("~/git/IMD")
knitr::opts_knit$set(root.dir = "~/git/IMD-PUPY")

##Check if the working directory is the top folder of the repository (called IMD)
getwd()
```

# 2.2 Clinical Biomarker Data

### Loading patient specific data

Note: this step requires input from the user, to select data from another patient as the example (QTOF_KV_184).

```{r}
## Load data for all patients
patientdata <- read.csv("Data/Data_PuPyMS_QTOF_KV_BIGCAT.csv")

##Select Patient ID here (options: A through T)
patientID <- "I" ## patients B, C, P and Q do not have data on the AA panel
#if value for age is not numeric (e.g. "newborn", sample: QTOF_KV_112, don't perform as.numeric.)
if(grepl("^[A-Za-z]+$", patientdata[1,patientID], perl = T)){
  print("Warning: Non-numeric age value is part of data")
  if(patientdata[1,patientID] == "newborn"){
    age <- as.numeric(0)
  }
  else{print("Warning: name for age value not recognized")}
}else{
    age <- as.numeric(patientdata[1,patientID])
  }
## Note there is a difference in the age categories between the two assays

##PUPY: (age in years)
if(age >= 0 && age < 12){
  ref_age_PUPY = c(5,6)  #age_0to1y
  agerange <- "0 to 1 years"
  }else if(age >= 12 && age < 60){
  ref_age_PUPY  = c(7,8) #age_1to5y
  agerange <- "1 to 5 years"
  }else if(age >= 60 && age < 192){
  ref_age_PUPY  = c(9,10) #age_5to16y
  agerange <- "5 to 16 years"
  }else if(age >= 192){
  ref_age_PUPY = c(11,12) #age_16yandUp
  agerange <- "16+ years"
  }else{
    ref_age_PUPY = c(13,14) #age_0to16
    agerange <- "0 to 16 years"
    }

##Urea: (age in months = m, in years = y)
if(age >= 0 && age < 1){
  ref_age_urea = c(4,5) #age_0to1m
  }else if(age >= 1 && age < 6){
  ref_age_urea = c(6,7) #age_1to6m
  }else if(age >= 6 && age < 12){
  ref_age_urea = c(8,9) #age_6mto1y
  }else if(age >= 12 && age < 24){
  ref_age_urea = c(10,11) #age_1to2y
  }else if(age >= 24 && age < 48){
  ref_age_urea = c(12,13) #age_2-4y
  }else if(age >= 48 && age < 120){
  ref_age_urea = c(14,15) #age_4-10y
  }else if(age >= 120 && age < 216){
  ref_age_urea = c(16,17) #age_10-18y
  }else{
    ref_age_urea = c(18,19) #age_18yandUp
    }

print(paste("Selected Patient ID is: " , patientID, ", age is between: ", agerange, " old"))

```

### Combine Reference + Patient data for each assay seperatly

```{r}
#Load reference data (includes annotations for biomarkers)
refvalues_PUPY <- read.delim("Data/Reference_values_PUPY_noNAs.txt")
refvalues_urea <- read.delim("Data/Reference_values_Urea.txt")

# Select column based on patientID:
patient.column <- as.integer(match(patientID,names(patientdata)))

##Select Referencedata for one patient 
patientID_PUPY <- as.data.frame(patientdata[4:37, c(1,patient.column)],  drop=false) #Selecting data for PUPY only
patientID_urea <- as.data.frame(patientdata[42:95, c(1,patient.column)],  drop=false) #Selecting data for Urea Acids (including Amino Acids) only
patientID_treatment <- as.data.frame(patientdata[97, c(1,patient.column)],  drop=false) #Selecting data for Urea Acids (including Amino Acids) only

## Choose correct reference value column for further data visualization
age_patientID_PUPY <- as.data.frame(refvalues_PUPY[ , c(1:3, ref_age_PUPY[1], ref_age_PUPY[2], 15 )],  drop=false) #Selecting reference data for PUPY, age: 0 to 1 year
remove(refvalues_PUPY,ref_age_PUPY)
age_patientID_urea <- as.data.frame(refvalues_urea[ , c(1:3, ref_age_urea[1], ref_age_urea[2], 20 )],  drop=false) #Selecting reference data for Urea, age: 0 to 1 month
remove(refvalues_urea,ref_age_urea)
remove(patient.column)

print(paste("Relevant age data is selected for patient: " , patientID))

```

# 2.3 Pathway Models

Retrieve Metabolite, Protein and Disease Data from Pathway Models

```{r}

if(!"SPARQL" %in% installed.packages()){
  install.packages("SPARQL")
}
library(SPARQL)

##Connect to Endpoint WikiPathways
endpointwp <- "https://sparql.wikipathways.org/sparql"

## 1. Query metadata:

queryMetadata <-
"SELECT DISTINCT ?dataset (str(?titleLit) as ?title) ?date ?license 
WHERE {
   ?dataset a void:Dataset ;
   dcterms:title ?titleLit ;
   dcterms:license ?license ;
   pav:createdOn ?date .
 }"

resultsMetadata <- SPARQL(endpointwp,queryMetadata,curl_args=list(useragent=R.version.string))
showresultsMetadata <- resultsMetadata$results
remove(queryMetadata, resultsMetadata)

## 2. Query Pathway data:

##Pathway Models IDs of interest:
pathways_of_Interest <- paste('"WP4224"', '"WP4225"', '"WP4571"', '"WP4595"', '"WP4583"', '"WP4584"')

queryAllContent_1 <-
"
PREFIX wp:      <http://vocabularies.wikipathways.org/wp#>
PREFIX rdfs:    <http://www.w3.org/2000/01/rdf-schema#>
PREFIX dcterms: <http://purl.org/dc/terms/>

#Variable selection
SELECT DISTINCT (str(?title) as ?pathwayName) ?PWID 
(count(distinct ?geneProduct) AS ?GenesInPWs)
(count(distinct ?protein) AS ?ProteinsInPWs) 
(count(distinct ?metaboliteNode) AS ?MetabolitesInPWs) 
(count(distinct ?interactionID) AS ?RheaInPWs) 
(count(distinct ?interactionMissing) AS ?NoRheaInPWs)
(count(distinct ?omim) as ?diseaseIDs)

WHERE {
  
  VALUES ?PWID {
"

queryAllContent_2 <-
"
}
    ?pathway dcterms:identifier ?PWID. #Obtain the ID
    #?pathway wp:ontologyTag cur:IEM .
    ?pathway wp:isAbout ?gpmlRDF_ID . #find the corresponding GPML link     
    ?pathway dc:title ?title . #Obtain the title   

      {
    ?geneProduct dcterms:isPartOf ?pathway . #Only those part of PW             
    ?geneProduct a wp:GeneProduct . #Filter for Protein DataNodes    
    } 
    UNION
    {
    ?protein dcterms:isPartOf ?pathway . #Only those part of PW             
    ?protein a wp:Protein . #Filter for Protein DataNodes    
    } 
    UNION
    { 
    ?metaboliteNode a wp:Metabolite . #Filter for Metabolite DataNodes
    ?metaboliteNode dcterms:isPartOf ?pathway . #Only those part of PW
    }
    UNION 
    { 
    OPTIONAL{?interaction wp:bdbRhea ?interactionID . #Find interactions with Rhea
    ?interaction dcterms:isPartOf ?pathway .} #Only those part of PW
    }
    UNION
    {
    OPTIONAL{?interactionMissing dcterms:isPartOf ?pathway . #Additional interactions
    ?interactionMissing rdf:type wp:Conversion . #Type 'metabolic conversion'
    FILTER NOT EXISTS {?interactionMissing wp:bdbRhea ?interactionID . }#No Rhea   
    }  
    }
    UNION {
    ?diseaseNode dcterms:isPartOf ?gpmlRDF_ID . #Only check for matching pathways 
    ?diseaseNode rdf:type gpml:Label . #Only include textLabels
    ?diseaseNode gpml:href ?omim . #That have an href attribute  
    FILTER regex(?omim, \"omim.org\", \"i\") #Only keep hrefs if they contain 'omim.org'
    }
      
} ORDER BY ASC(?pathway)
"
queryAllContent <- paste(queryAllContent_1,pathways_of_Interest,queryAllContent_2)
resultsAllContent <- SPARQL(endpointwp,queryAllContent,curl_args=list(useragent=R.version.string))
showresultsAllContent <- resultsAllContent$results

remove(queryAllContent, resultsAllContent)

#Print results Table
print(showresultsMetadata)
print(showresultsAllContent)

remove(pathways_of_Interest, queryAllContent_1, queryAllContent_2)
```

# 2.4 Selection of Relevant Biomarkers

First, the Patient data is connected to the reference data (including identifier mapping). Second, the annotated data is checked for missing values (NAs) and unification to ChEBI.
```{r}
## First, match the names of all biomarkers to an identifier (from reference value data, these mappings have been checked manually)
library(dplyr) #needed for left_join() operation
#Note: the matching below is case sensitive (left_join from dplyr doesn't have an option for case insensitive matching)
annotated_PUPY <-
left_join(patientID_PUPY, age_patientID_PUPY, by=c("Dutch.name")) %>%
  rowwise()  
  colnames(annotated_PUPY)[5] <- "low.Ref"
  colnames(annotated_PUPY)[6] <- "high.Ref"

annotated_urea <-
left_join(patientID_urea, age_patientID_urea, by=c("Dutch.name")) %>%
  rowwise()  
  colnames(annotated_urea)[5] <- "low.Ref"
  colnames(annotated_urea)[6] <- "high.Ref"

annotated_both <- rbind(annotated_PUPY, annotated_urea) #Combining data for PUPY and urea biomarkers in one dataframe

#Remove rows which are for treatment values (Oxypurinol and allopurinol) or not measured by assays anymore(Argininosuccinic acid anhydride) by name of compound
annotated_both <- annotated_both [(!(annotated_both$Dutch.name=="oxypurinol") & !(annotated_both$Dutch.name=="allopurinol") & !(annotated_both$Dutch.name=="ASA_anhydride")),]

MissingMappings <- list()
counter = 1
for (i in 1:nrow(annotated_both)){
  if(annotated_both[i,4] != ""){next}
  else{
    MissingMappings[counter] <- (annotated_both[i,1])
    counter <- counter + 1
   }
}
remove(i,counter)

CHEBIMissing <- list()
counter = 1
for (i in 1:nrow(annotated_both)){
  if(!grepl('CHEBI', annotated_both[i,4]) && annotated_both[i,4] != ""){
    CHEBIMissing[counter] <- (annotated_both[i,1])
    counter <- counter + 1
    }
  else{next}
}
remove(i,counter)
remove(patientID_PUPY,patientID_urea,annotated_PUPY,annotated_urea,age_patientID_PUPY,age_patientID_urea)
print(paste("These biomarkers do not have an identfier mapping:" , paste(c(MissingMappings), collapse=', ' )))
print(paste("These biomarkers are not annotated with a ChEBI ID:" , paste(c(CHEBIMissing), collapse=', ' )))
  
```

Second, query which biomarkers measured in the assays are not in any pathway model:
```{r}
allBiomarkers <- annotated_both$ID ##Take the ID column for all data
allBiomarkers <- allBiomarkers[grepl("^CHEBI:", allBiomarkers)] #remove Is not starting with 'CHEBI:' for consistency
library(stringr)
cleaned_CHEBI_all <- str_replace(allBiomarkers, "CHEBI:", "ch:") #update IDs to work with SPARQL query
#supercleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "") #update IDs to work with SPARQL query
string_CHEBI_all <- paste(c(cleaned_CHEBI_all), collapse=' ' )
remove (cleaned_CHEBI_all)

item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
SELECT DISTINCT ?chebiMetabolite WHERE {
  VALUES ?chebiMetabolite {"
item2 = "}
  ?pathwayRes  a wp:Pathway ;
             	wp:organismName \'Homo sapiens\' .
  
  ?metabolite 	a wp:Metabolite ;
                dcterms:identifier ?id ;
                dcterms:isPartOf ?pathwayRes .

  ?metabolite wp:bdbChEBI ?chebiMetabolite.
}"

queryMissingBiomarkers_all <- paste(item1,string_CHEBI_all,item2)
remove(item1,item2)

resultsMissingBiomarkers_all <- SPARQL(endpointwp,queryMissingBiomarkers_all,curl_args=list(useragent=R.version.string))
listMissingBiomarkers_all <- c(resultsMissingBiomarkers_all$results) #safe results as list for comparison.
remove(queryMissingBiomarkers_all,resultsMissingBiomarkers_all)

CHEBI_inPWs_all <- gsub("[<https://identifiers.org/chebi/CHEBI:>]", "", listMissingBiomarkers_all) #ChEBI IDs Results IRI cleanup
CHEBI_inQuery_all <- gsub("CHEBI:", "", allBiomarkers) #ChEBI IDs Query cleanup
intersectingCHEBI_all <- setdiff(CHEBI_inQuery_all,CHEBI_inPWs_all)
string_intersectingCHEBI_all <- paste(c(intersectingCHEBI_all), collapse=', ' )
#count_biomarkers <- sapply(strsplit(string_CHEBI, " "), length)

#Find names for missing Biomarkers based on ChEBI ID (to help with data curation)
CHEBI_intersectingCHEBI_all <- paste0("CHEBI:", intersectingCHEBI_all) #update IDs to match back to data again
missingNames_all <- list()
for (j in 1:length(CHEBI_intersectingCHEBI_all)){
  for (i in 1:nrow(annotated_both)){
    if(annotated_both[i,4] == CHEBI_intersectingCHEBI_all[j]){
       missingNames_all[j] <- annotated_both[i,3]
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_all <- do.call(paste, c(as.list(missingNames_all), sep = ", "))

#Print relevant information:
if(length(intersectingCHEBI_all) == 0 ){print("All biomarkers on the assay(s) are in a pathway!")} else{
  print(paste0("These biomarkers (as ChEBI IDs) are not in any pathway: " , string_intersectingCHEBI_all, "; with the following Database names: ", string_missingNames_all))}

remove(allBiomarkers, CHEBI_inPWs_all, CHEBI_inQuery_all, CHEBI_intersectingCHEBI_all, intersectingCHEBI_all, string_CHEBI_all, string_intersectingCHEBI_all, string_missingNames_all, missingNames_all, CHEBIMissing)
```

Third, comparing patient data to reference data to calculate the log(2) change value and data cleanup.
```{r}
# First, convert the Patient Data column to a type "double" iso "character" (empty values will become "NA")
names(annotated_both)[names(annotated_both) == patientID] <- "i.patientData" 
annotated_both$i.patientData <- as.integer(annotated_both$i.patientData)

# Second, compare the Patient Data to the reference Values (if available), and calculate an additional column called "Change".
annotated_both <- annotated_both%>%mutate(Change = case_when(
  i.patientData == 0 ~ NA_real_,                    # Patient data is equal to zero
  high.Ref == 0 ~ NA_real_,                    # If highRef is equal to zero (to avoid dividing by zero)
  is.na(i.patientData) ~ NA_real_,                  # No patient data available
  is.na(high.Ref) & is.na(low.Ref) & i.patientData != 0 ~ NA_real_,   # If no highRef and lowRef value is available (NA), and there is PatientData, change is NA (unable to determine)
 i.patientData >= low.Ref & i.patientData <= high.Ref ~ 1,  #Patient data between reference values
 i.patientData < low.Ref ~ i.patientData/low.Ref, #Patient data lower than lowRef value (downregulated) 
 i.patientData > high.Ref ~ i.patientData/high.Ref   #Patient data higher than highRef value (upregulated)
))

# Third, remove all rows in the dataframe where the Change is NA .
annotated_both <- annotated_both[complete.cases(annotated_both$Change),]

# Add patient ID to column again:
names(annotated_both)[names(annotated_both) == "i.patientData"] <- patientID #Rename column to actual Patient ID for visualization
# Convert "Change column to log(2) scale
annotated_both$log.Change <- log(annotated_both$Change, 2)
#Convert Change column from scientific notation to 2 digits.
is.num <- sapply(annotated_both, is.numeric)
annotated_both[is.num] <- lapply(annotated_both[is.num], round, 2)
remove(is.num)

#Copy data for further cleanup (sort based on highest log-change value, no zero values etc.):
annotated_both_nozero <-annotated_both
#Data cleanup
annotated_both_nozero <- annotated_both_nozero [(!(annotated_both_nozero$log.Change==0)),] #Remove rows with zero's :
annotated_both_nozero <- annotated_both_nozero[order(-annotated_both_nozero$log.Change),]  #Sort on log2(change)
#Print final dataset of patient before visualisation (only relevant columns):
print(annotated_both_nozero[,c(1,3,4,9)], digits = 2)

```

Fourth, find which relevant biomarkers are in a pathway model.
```{r}
##Add all IDs in a list
CHEBI_IDs <- list()
counter = 1
for (i in 1:nrow(annotated_both_nozero)){
  if(grepl('CHEBI', annotated_both_nozero[i,4]) && annotated_both_nozero[i,4] != "" && annotated_both_nozero[i,9] != 0){
    CHEBI_IDs[counter] <- (annotated_both_nozero[i,4])
    counter <- counter + 1
    }
  else{next}
}
remove(i,counter)
vector_CHEBI <- unlist(CHEBI_IDs) #convert list to array, for traversing the data to a SPARQL query later on
library(stringr)
cleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "ch:") #update IDs to work with SPARQL query
supercleaned_CHEBI <- str_replace(vector_CHEBI, "CHEBI:", "") #update IDs to work with SPARQL query
string_CHEBI <- paste(c(cleaned_CHEBI), collapse=' ' )
remove (CHEBI_IDs, vector_CHEBI, cleaned_CHEBI)

##Connect to Endpoint WikiPathways (if step 2.3 is skipped)
endpointwp <- "https://sparql.wikipathways.org/sparql"

##For now, split up the query in sections, to add the string_CHEBI as VALUES.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs)
where {
VALUES ?chebiMetabolite {"
item2 = " }
 
 ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetabolite ;
    		dcterms:isPartOf ?pathwayRes .

 ?pathwayRes a wp:Pathway ;
             wp:organismName \'Homo sapiens\' ; 
    		dcterms:identifier ?wpid ;
    		dc:title ?title .
}

ORDER BY DESC(?CHEBIsInPWs)"

query_RelevantPWs <- paste(item1,string_CHEBI,item2)
remove(item1,item2)

results_RelevantPWs <- SPARQL(endpointwp,query_RelevantPWs,curl_args=list(useragent=R.version.string))
showresults_RelevantPWs <- results_RelevantPWs$results
remove(query_RelevantPWs,results_RelevantPWs)

print(paste0("There are ", nrow(annotated_both_nozero) ," biomarkers relevant for patient " , patientID, ", with ChEBI-IDs: ", string_CHEBI, "."))

```

Fifth, find which patient specific biomarkers are not in any pathway.
```{r}
##Find Missing Biomarkers (not part of any Human pathway model)
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
SELECT DISTINCT ?chebiMetabolite WHERE {
  VALUES ?chebiMetabolite {"
item2 = "}
  ?pathwayRes  a wp:Pathway ;
             	wp:organismName 'Homo sapiens' .
  
  ?metabolite 	a wp:Metabolite ;
                dcterms:identifier ?id ;
                dcterms:isPartOf ?pathwayRes .

  ?metabolite wp:bdbChEBI ?chebiMetabolite.
}"

queryMissingBiomarkers <- paste(item1,string_CHEBI,item2)
remove(item1,item2)

resultsMissingBiomarkers <- SPARQL(endpointwp,queryMissingBiomarkers,curl_args=list(useragent=R.version.string))
listMissingBiomarkers <- c(resultsMissingBiomarkers$results) #safe results as list for comparison.
remove(queryMissingBiomarkers,resultsMissingBiomarkers)

CHEBI_inPWs <- gsub("[<https://identifiers.org/chebi/CHEBI:>]", "", listMissingBiomarkers) #ChEBI IDs IRI cleanup
intersectingCHEBI <- setdiff(supercleaned_CHEBI,CHEBI_inPWs)
string_intersectingCHEBI <- paste(c(intersectingCHEBI), collapse=', ' )
count_biomarkers <- sapply(strsplit(string_CHEBI, " "), length)

#Find names for missing Biomarkers based on ChEBi ID (to help with data curation)
CHEBI_intersectingCHEBI <- paste0("CHEBI:", intersectingCHEBI) #update IDs to match back to data again
missingNames <- list()
for (j in 1:length(CHEBI_intersectingCHEBI)){
  for (i in 1:nrow(annotated_both_nozero)){
    if(annotated_both_nozero[i,4] == CHEBI_intersectingCHEBI[j]){
       missingNames[j] <- annotated_both_nozero[i,3]
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames <- do.call(paste, c(as.list(missingNames), sep = ", "))

#Print relevant information:
if(length(intersectingCHEBI) == 0 ){print("All relevant biomarkers are in a pathway!")} else{
  print(paste0("These biomarkers (as ChEBI IDs) are not in a pathway: " , string_intersectingCHEBI, "; with the following Database names: ", string_missingNames))}
```

# 2.5 Theoretical Biomarker Data

Read in information from IEMBase (collected manually) and select only data relevant to sample matrix (urine) and age category (from patient)

```{r}
biomarkers_purine_all <- read.table(file = 'Data/IEM_database_info_Purine.txt', sep = '\t', header = TRUE)
biomarkers_pyrimidine_all <- read.table(file = 'Data/IEM_database_info_Pyrimidine.txt', sep = '\t', header = TRUE)
biomarkers_urea_all <- read.table(file = 'Data/IEM_database_info_Urea.txt', sep = '\t', header = TRUE)

#Summarize Purine disorders
list_diseases_purine <- unique(na.omit(biomarkers_purine_all[c(17)])) #list all unique disease protein names,  #Remove NA values
#Summarize Pyrimidine disorders
list_diseases_pyrimidine <- unique(na.omit(biomarkers_pyrimidine_all[c(17)])) #list all unique disease protein names #Remove NA values
#Summarize Urea Cycle disorders
list_diseases_urea <- unique(na.omit(biomarkers_urea_all[c(17)])) #list all unique disease protein names #Remove NA values

total_list_diseases <- c(list_diseases_purine[[1]], list_diseases_pyrimidine[[1]], list_diseases_urea[[1]])

## Print amount of unique diseases for each category:
paste0("In total, there were ", length(list_diseases_purine[[1]]) ," unique purine, ", length(list_diseases_pyrimidine[[1]]), " pyrimidine and ", length(list_diseases_urea[[1]]), " urea cycle IMDs (total of ", length(total_list_diseases) ," disorders).")

##PUPY: (age in years)
if(age >= 0 && age < 1){
  ref_age_markers = c(10)  #age_0to1m, neonatal
  }else if(age >= 1 && age < 18){
  ref_age_markers  = c(11) #age_1mto18m, infancy
  }else if(age >= 18 && age < 132){
  ref_age_markers  = c(12) #age_18mto11y, childhood
  }else if(age >= 131 && age < 192){
  ref_age_markers = c(13) #age_11to16y, adolescence
  }else{
    ref_age_markers = c(14) #age_16<, adulthood
  }

biomarkers_purine_age <- as.data.frame(biomarkers_purine_all[, c(1,17,5:7,ref_age_markers,16)],  drop=false) 
biomarkers_purine_age$Category <- 1
biomarkers_pyrimidine_age <- as.data.frame(biomarkers_pyrimidine_all[, c(1,17,5:7,ref_age_markers,16)],  drop=false) 
biomarkers_pyrimidine_age$Category <- 2
biomarkers_urea_age <- as.data.frame(biomarkers_urea_all[, c(1,17,5:7,ref_age_markers,16)],  drop=false) 
biomarkers_urea_age$Category <- 3

biomarkers_three <- rbind(biomarkers_purine_age, biomarkers_pyrimidine_age, biomarkers_urea_age) #Combining data theoretical biomarkers in one dataframe
names(biomarkers_three)[6] <- "UpDown"
remove(biomarkers_purine_all, biomarkers_purine_age, biomarkers_pyrimidine_all, biomarkers_pyrimidine_age, biomarkers_urea_all, biomarkers_urea_age)

# Third, clean up the data, by selecting only entries with a HMDB ID, measured in urine, and converting non-numerical entries for up/down regulation.
biomarkers_three <- biomarkers_three  [(!(is.na(biomarkers_three$HMDB))),]
biomarkers_three <- biomarkers_three  [(!(is.na(biomarkers_three$Measured.in))) & ((biomarkers_three$Measured.in=="U") | (biomarkers_three$Measured.in=="Urine")), ]

#Replace non-numerical characters with values
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-2 to -1"] <- ("-1.5")) 
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-2 to n"] <- ("-1"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "-1 to n"] <- ("-0.5"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n to 2"] <- ("1"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n to 1"] <- ("0.5"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "n"] <- ("0"))
biomarkers_three <- within(biomarkers_three, UpDown[UpDown == "+-"] <- ("0"))

biomarkers_three$UpDown <- as.numeric(biomarkers_three$UpDown)

##Convert old HMD's to new IDs:
for (i in 1:length((biomarkers_three$HMDB))) {
  if(nchar(biomarkers_three$HMDB[i])<= 9){
    biomarkers_three$HMDB[i] <- str_replace(biomarkers_three$HMDB[i], "HMDB", "HMDB00")
  }
  else{next}
}
remove(i)

## Print amount of unique diseases and biomarkers:
paste0("For these three classes of IMDs, ", nrow(unique(na.omit(biomarkers_three[c(5)]))) ," unique biomarkers were relevant for the sample matrix urine, representing ",  nrow(unique(na.omit(biomarkers_three[c(2)]))) , " unique disorders; " , sum(is.na(biomarkers_three[c(2)])), " disorder is missing a protein name.")

##Dynamic caption info:
df_figure_caption_biomarkers <- biomarkers_three[,c(3,5)] # Combines biomarkers_three$Symptoms with biomarkers_three$HMDB
df_figure_caption_biomarkers <- unique(df_figure_caption_biomarkers) #Only retain unique metabolite names and HMDB IDs.
df_figure_caption_biomarkers <- na.omit(df_figure_caption_biomarkers) #remove any rows with NA-values
df_figure_caption_disorders <- biomarkers_three[,c(2,1)] # Combines biomarkers_three$Protein_HGNC_name with biomarkers_three$Disorder
df_figure_caption_disorders <- unique(df_figure_caption_disorders) #Only retain unique disorder names + HGNC names
df_figure_caption_disorders <- na.omit(df_figure_caption_disorders) #remove any rows with NA-values

relevant <- biomarkers_three[,c(2,5,6,8)]

##Remove rows with NA values anywhere
relevant <- relevant[complete.cases(relevant), ]
##Remove rows with zero values anywhere
relevant <- relevant[apply(relevant!=0, 1, all),]

#Keep unique diseases and category names
diseases_categories <- relevant[,c(1,4)]
unique_disease_categories <- aggregate(diseases_categories[-1], list(diseases_categories$Protein_HGNC_name), FUN = mean, na.rm = TRUE) #Sorted alphabetically

# Load required packages for cast() function (data transformation based on certain column input.)
if (!require("reshape2")) {
   install.packages("reshape2", dependencies = TRUE)
   library(reshape2)
   }
if (!require("reshape")) {
   install.packages("reshaper", dependencies = TRUE)
   library(reshape)
   }

# Transform the data, using protein HGNC names as row names, HMDB-IDs as columns, and UpDown-regulation values to create matrix
protein.cast <- cast(relevant, Protein_HGNC_name~HMDB, value="UpDown", sum)
column_start = (ncol(protein.cast)+1)
for (i in 1:nrow(protein.cast)) {
  if(protein.cast[i,1] == unique_disease_categories[i,1]){
    protein.cast[i,column_start] <- unique_disease_categories[i,2]
  }
}
remove(i)
names(protein.cast)[column_start] <- "Category"
protein.cast_ordered <- protein.cast[order(protein.cast$Category),]

lines_Category_Purine <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "1",])
lines_Category_Pyrimidine <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "2",])
lines_Category_Urea <- nrow(protein.cast_ordered[protein.cast_ordered$Category == "3",])

##Remove categories column again, so it will not disturb the correlation calculation
protein.cast_ordered <- subset(protein.cast_ordered, select=-c(Category))
rnames <- protein.cast_ordered[,1] #Store the protein names to use as row values in matrix later
matrix_protein.cast <- data.matrix(protein.cast_ordered) #convert the dataframe to a matrix (only including numeric values)
rownames(matrix_protein.cast) <- rnames #Add the saved protein names as rowLabels
matrix_protein.cast <- matrix_protein.cast[,-1] #Remove the first column, including count of rows from dataframe. #Sorted alphabetically

remove(biomarkers_three, diseases_categories, protein.cast, protein.cast_ordered, relevant, unique_disease_categories)

not_covered_IMDs <- setdiff(total_list_diseases,rnames)
not_covered_IMDs <- paste(not_covered_IMDs, collapse = ', ')
amount_biomarkers_matrixData <- ncol(matrix_protein.cast)
amount_diseases_matrixData <- nrow(matrix_protein.cast)

##Add disorders not covered with zero's for all columns:
#MatrixB <- rbind(MatrixA, c(10,11,12))  

## Print amount of unique diseases for each category:
paste0("Taking available reference data into account and availability of a HMDB ID, a maximum of ", amount_biomarkers_matrixData ," unique biomarkers could be linked to ", amount_diseases_matrixData, " IMDs.")
paste0("For the following IMDs, no data was available: ", not_covered_IMDs, ".")

if (!require("gplots")) {
   install.packages("gplots", dependencies = TRUE)
   library(gplots)
   }
if (!require("RColorBrewer")) {
   install.packages("RColorBrewer", dependencies = TRUE)
   library(RColorBrewer)
   }

my_palette <- colorRampPalette(c("blue", "white", "red"))(n = 299)

col_breaks = c(seq(-3,-1,length=100),  # for blue
  seq(-0.99,0.99,length=100),           # for yellow
  seq(1,3,length=100))             # for red

#Create a file name specific for patientID
filename <- paste0("Images/", age, "months_heatmap.png")
title_name <- paste0("Biomarker overlap for three IMD types, \n age category: ", agerange)

# creates a 5 x 5 inch image
png(filename,    # create PNG for the heat map        
  width = 5*300,        # 5 x 300 pixels
  height = 5*300,
  res = 280,            # 300 pixels per inch
  pointsize = 8)        # smaller font size

output <- heatmap.2(matrix_protein.cast,
  #cellnote = matrix_protein.cast,  # same data set for cell labels
  main = title_name, # heat map title
  notecol="black",      # change font color of cell labels to black
  cellnote = ifelse(matrix_protein.cast != 0, matrix_protein.cast, ""), #Visualizing only label if data is not zero
  
  RowSideColors = c(    # grouping row-variables into different categories, e.g.
     rep("gray", lines_Category_Purine),   #  Purine diseases
     rep("blue", lines_Category_Pyrimidine),    # Pyrimidine diseases
     rep("black", lines_Category_Urea)),    # Urea diseases
  
  density.info="none",  # turns off density plot inside color legend
  trace="none",         # turns off trace lines inside the heat map
  margins =c(12,9),     # widens margins around plot
  col=my_palette,       # use on color palette defined earlier
  breaks=col_breaks,    # enable color transition at specified limits
  dendrogram="row",     # only draw a row dendrogram
  Colv="NA")            # turn off column clustering

par(lend = 1)           # square line ends for the color legend
legend("topright",      # location of the legend on the heatmap plot
    legend = c("Purine", "Pyrimidine", "Urea cycle"), # category labels
    col = c("gray", "blue", "black"), # color key
    title = "IMD classes",           # legend title
    lty= 1,             # line style
    lwd = 10            # line width
)
 

dev.off()               # close the PNG device

remove(rnames, title_name)

#Show the heatmap as output in RMD file
knitr::include_graphics(filename)

##Obtain the order of the diseases and biomarkers as appearing in the clustered heatmap:
verticalAxis <- rev(rownames(matrix_protein.cast)[output$rowInd])
horizontalAxis <- colnames(matrix_protein.cast)[output$colInd]

##New dynamic Figure captions:
###Diseases
disorderNames <- list()

for (j in 1:length(verticalAxis)){
  for (i in 1:nrow(df_figure_caption_disorders)){
    if(df_figure_caption_disorders[i,1] == verticalAxis[j]){
      combineDiseaseHGNC <- paste0(verticalAxis[j], ': ', df_figure_caption_disorders[i,2])
       disorderNames[j] <- combineDiseaseHGNC
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_disorderNames <- do.call(paste, c(as.list(disorderNames), sep = ", "))

###Biomarkers
biomarkerNames <- list()

for (j in 1:length(horizontalAxis)){
  for (i in 1:nrow(df_figure_caption_biomarkers)){
    if(df_figure_caption_biomarkers[i,2] == horizontalAxis[j]){
      combinebiomarkerHMDB <- paste0(horizontalAxis[j], ': ', df_figure_caption_biomarkers[i,1])
       biomarkerNames[j] <- combinebiomarkerHMDB
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames <- do.call(paste, c(as.list(biomarkerNames), sep = ", "))

#df_figure_caption_biomarkers

#Print Figure caption (sorted diseases and unsorted metabolites)
paste("Protein names correspond to the following disorders:", ordered_disorderNames)
paste(". HMDB IDs resemble these metabolites:", ordered_biomarkerNames)

remove(total_list_diseases, amount_biomarkers_matrixData, amount_diseases_matrixData, column_start, ordered_biomarkerNames, ordered_disorderNames, list_diseases_purine, list_diseases_pyrimidine, list_diseases_urea, not_covered_IMDs)
```

# 2.6 Relevant Biomarker Overlap

```{r}
## Obtain patient data
patient_biomarker_data_HMDB <- annotated_both_nozero[,c(4,7,9)]
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[!(is.na(patient_biomarker_data_HMDB$HMDB) | patient_biomarker_data_HMDB$HMDB==""), ]
#replace data smaller than 0.05 or -0.05 with NA
patient_biomarker_data_HMDB$log.Change[findInterval(patient_biomarker_data_HMDB$log.Change, c(-0.05,0.05)) == 1L] <- NA
# Remove all rows in the dataframe where the Change is NA .
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[complete.cases(patient_biomarker_data_HMDB$log.Change),]

#Dynamic caption for patient specific biomarkers:
df_figure_caption_biomarkers_patientspecific <- annotated_both_nozero[,c(3,7)] # Combines biomarkers_three$Symptoms with biomarkers_three$HMDB
df_figure_caption_biomarkers_patientspecific <- unique(df_figure_caption_biomarkers_patientspecific) #Only retain unique metabolite names and HMDB IDs.
df_figure_caption_biomarkers_patientspecific <- na.omit(df_figure_caption_biomarkers_patientspecific) #remove any rows with NA-values

#Convert log(2)change data to similar numeric scale as theoretical biomarker data (-3 to 3)
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB%>%mutate(biomarker.Change = case_when(
 log.Change > 3 ~ 3,         # Every log(2) change value larger then 3
 log.Change < -3 ~ -3,       # Every log(2) change value smaller then -3
 TRUE ~ log.Change           # Keep all other values the same
))

#Remove columns which are not needed
patient_biomarker_data_HMDB <- patient_biomarker_data_HMDB[,c(-1,-3)]
#Transpose data
patient_extension = (t(patient_biomarker_data_HMDB[,2]))
colnames(patient_extension) <- patient_biomarker_data_HMDB$HMDB

#Add label "patient' in front of ID, for clearer depiction in heatmap
if(patientID_treatment[,2] == 1){ 
  patientLabel <- paste0("Patient_", patientID, ", treated")
  }else{patientLabel <- paste0("Patient_", patientID, ", untreated")}

rownames(patient_extension) <- patientLabel

#Add patient data to matrix
matrix_protein.cast_patient <- acast(rbind(melt(matrix_protein.cast), melt(patient_extension)), X1~X2, sum) 
matrix_protein.cast_patient <- matrix_protein.cast_patient[apply(matrix_protein.cast_patient[,-1], 1, function(x) !all(x==0)),] #remove a row if all values are zero.
matrix_protein.cast_patient <- matrix_protein.cast_patient[, colSums(matrix_protein.cast_patient != 0) > 0] #remove a column if all values are zero.

#Create a file name specific for patientID
filename2 <- paste0("Images/", patientID, "_heatmap.png")
title_name2 <- paste0("Biomarker overlap for three IMD types, \n age category: ", agerange, "\n for patient:", patientID)

# creates a 5 x 5 inch image
png(filename2,    # create PNG for the heat map        
  width = 5*400+nrow(patient_biomarker_data_HMDB)*30, # Updated based on high number of biomarkers for patient.
  height = 5*300,
  res = 280,            # 300 pixels per inch
  pointsize = 8)        # smaller font size

output_patient <- heatmap.2(matrix_protein.cast_patient,
  #cellnote = round(matrix_protein.cast_patient,0),  # same data set for cell labels, round off to 1 decimal.
  main = title_name2, # heat map title
  notecol="black",      # change font color of cell labels to black
  cellnote = ifelse(matrix_protein.cast_patient != 0, round(matrix_protein.cast_patient,1), ""), #Visualizing only label if data is not zero
  
  #Color the space between the data points
  sepwidth=c(0.01,0.01),
  sepcolor="lightgray",
  colsep=1:ncol(matrix_protein.cast_patient),
  rowsep=1:nrow(matrix_protein.cast_patient),
  
  RowSideColors = c(    # grouping row-variables into different categories, e.g.
     rep("grey", lines_Category_Purine),   #  Purine diseases
     rep("blue", lines_Category_Pyrimidine),    # Pyrimidine diseases
     rep("black", lines_Category_Urea),    # Urea diseases
     rep("purple", 1)) , #patientData
  
  density.info="none",  # turns off density plot inside color legend
  trace="none",         # turns off trace lines inside the heat map
  margins =c(12,9),     # widens margins around plot
  col=my_palette,       # use on color palette defined earlier
  breaks=col_breaks,    # enable color transition at specified limits
  dendrogram="row",     # only draw a row dendrogram
  Colv="NA") #,            # turn off column clustering
  #na.color = "Grey")   # color NA values specifically --> dOESN'T WORK :(
  
par(lend = 1)           # square line ends for the color legend
legend("topright",      # location of the legend on the heatmap plot
    legend = c("Purine", "Pyrimidine", "Urea cycle", "Patient"), # category labels
    col = c("gray", "blue", "black", "purple"), # color key
    title = "IMD classes",           # legend title
    lty= 1,             # line style
    lwd = 10            # line width
)
 

dev.off()               # close the PNG device

#Show the heatmap as output in RMD file
knitr::include_graphics(filename2)

##Print names of metabolites added in heatmap originating from patient.
## Clustering order might differ, so data needs to be reloaded:

##Obtain the order of the diseases and biomarkers as appearing in the clustered heatmap:
verticalAxis_patient <- rev(rownames(matrix_protein.cast_patient)[output_patient$rowInd])
horizontalAxis_patient <- colnames(matrix_protein.cast_patient)[output_patient$colInd]

##New dynamic Figure captions:
###Diseases
disorderNames_patient <- list()

for (j in 1:length(verticalAxis_patient)){
  for (i in 1:nrow(df_figure_caption_disorders)){
    if(df_figure_caption_disorders[i,1] == verticalAxis_patient[j]){
      combineDiseaseHGNC_patient <- paste0(verticalAxis_patient[j], ': ', df_figure_caption_disorders[i,2])
       disorderNames_patient[j] <- combineDiseaseHGNC_patient
      }
    else{
      #noDiseaseID <- paste(verticalAxis_patient[(j-1)])#print message which ID cannot be found
      next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_disorderNames_patient <- do.call(paste, c(as.list(disorderNames_patient), sep = ", "))
ordered_disorderNames_patient <- str_replace_all(ordered_disorderNames_patient, ' , ', ' ' )

###Theoretical Biomarkers
biomarkerNames_patient <- list()

for (j in 1:length(horizontalAxis_patient)){
  for (i in 1:nrow(df_figure_caption_biomarkers)){
    if(df_figure_caption_biomarkers[i,2] == horizontalAxis_patient[j]){
      combinebiomarkerHMDB_patient <- paste0(horizontalAxis_patient[j], ': ', df_figure_caption_biomarkers[i,1])
       biomarkerNames_patient[j] <- combinebiomarkerHMDB_patient
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames_patient <- do.call(paste, c(as.list(biomarkerNames_patient), sep = ", "))

##Patient-specific Biomarkers
compare_patient_to_theoreticalMarkers <- setdiff(horizontalAxis_patient, df_figure_caption_biomarkers[,2])

biomarkerNames_patientspecific <- list()

for (j in 1:length(compare_patient_to_theoreticalMarkers)){
  for (i in 1:nrow(df_figure_caption_biomarkers_patientspecific)){
    if(df_figure_caption_biomarkers_patientspecific[i,2] == compare_patient_to_theoreticalMarkers[j]){
      combinebiomarkerHMDB_patientspecific <- paste0(compare_patient_to_theoreticalMarkers[j], ': ', df_figure_caption_biomarkers_patientspecific[i,1])
       biomarkerNames_patientspecific[j] <- combinebiomarkerHMDB_patientspecific
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
ordered_biomarkerNames_patientspecific <- do.call(paste, c(as.list(biomarkerNames_patientspecific), sep = ", "))

#df_figure_caption_biomarkers

#Print Figure caption (sorted diseases and unsorted metabolites)
paste0("Protein names correspond to the following disorders: ", ordered_disorderNames_patient, ". ")
paste0("HMDB IDs resemble these metabolites: ", ordered_biomarkerNames_patient, ". ")
paste0("Additional biomarkers relevant for this specific patient (not part of IEMbase data): ", ordered_biomarkerNames_patientspecific, ". ")

remove(biomarkerNames, biomarkerNames_patient, biomarkerNames_patientspecific, df_figure_caption_biomarkers, df_figure_caption_disorders, df_figure_caption_biomarkers_patientspecific, disorderNames, disorderNames_patient, horizontalAxis, horizontalAxis_patient, verticalAxis, verticalAxis_patient, ordered_biomarkerNames_patient, ordered_biomarkerNames_patientspecific, ordered_disorderNames_patient, compare_patient_to_theoreticalMarkers, combinebiomarkerHMDB, combinebiomarkerHMDB_patient, combinebiomarkerHMDB_patientspecific, combineDiseaseHGNC, combineDiseaseHGNC_patient)

```

# 2.7 Pathway Selection

First, find relevant pathway from WikiPathways (sorted on covering most patient specific biomarkers)

```{r}
#For now, filter out Reactome PWs due to visualization issues.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (GROUP_CONCAT(DISTINCT fn:substring(?hgnc,37);separator=' ') AS ?Proteins) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"

item2 = "}
 
 ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetabolite ;
    		dcterms:isPartOf ?pathwayRes .

 ?pathwayRes a wp:Pathway ;
             wp:organismName 'Homo sapiens' ; 
    		dcterms:identifier ?wpid ;
    		dc:title ?title .
    		
 ?datanode2 wp:bdbHgncSymbol ?hgnc ;
    		dcterms:isPartOf ?pathwayRes .
    		
  #?pathwayRes wp:ontologyTag cur:Reactome_Approved . 
  ?pathwayRes wp:ontologyTag cur:AnalysisCollection .   		
}

ORDER BY DESC(?CHEBIsInPWs) "

query_CombinePWs <- paste(item1,string_CHEBI,item2)
remove(item1, item2)
results_CombinePWs <- SPARQL(endpointwp,query_CombinePWs,curl_args=list(useragent=R.version.string))
showresults_CombinePWs <- results_CombinePWs$results
remove(query_CombinePWs,results_CombinePWs)

#Print table with first 5 relevant pathways (if less than 5 are found, print only those)
if(nrow(showresults_CombinePWs) < 5){
print(showresults_CombinePWs_Second[1:nrow(showresults_CombinePWs_Second),c(2:4,6,5)])
}else{print(showresults_CombinePWs[1:5,c(2:4,6,5)])}
```

Select most relevant pathways

```{r}
## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
first_selected_pathway = 1
```

Second, find second pathway covering most unique biomarkers:

```{r}
#First, obtain all the ChEBI IDs of the first ranked pathway, compare these to the list of remaining markers and only retain the marker which weren't covered yet
catch_chebis_firstPW <- showresults_CombinePWs[first_selected_pathway,6]
split_chebis_firstPW <- as.list(strsplit(catch_chebis_firstPW, ' ')[[1]])
split_string_CHEBI <- as.list(strsplit(string_CHEBI, ' ')[[1]])
cleaned_string_CHEBI <- as.list(str_replace(split_string_CHEBI, "ch:", ""))
compare_FirstSecond_CHEBI <- setdiff(cleaned_string_CHEBI, split_chebis_firstPW)
query_compatible_second <- as.list(paste("ch:", compare_FirstSecond_CHEBI, sep=""))
string_CHEBI_second <- paste(query_compatible_second, collapse = ' ')
remove(catch_chebis_firstPW,split_chebis_firstPW,split_string_CHEBI,cleaned_string_CHEBI,compare_FirstSecond_CHEBI,query_compatible_second)

item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (count(distinct ?chebiMetaboliteSecond) AS ?CHEBIsInPWsSecond) (GROUP_CONCAT(DISTINCT fn:substring(?hgnc,37);separator=' ') AS ?Proteins) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"

item2 = "}
VALUES ?chebiMetaboliteSecond {"

item3 = "}
 
 ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetabolite ;
    		dcterms:isPartOf ?pathwayRes .

 ?pathwayRes a wp:Pathway ;
             wp:organismName 'Homo sapiens' ; 
    		dcterms:identifier ?wpid ;
    		dc:title ?title .
    		
 ?datanode2 wp:bdbHgncSymbol ?hgnc ;
    		dcterms:isPartOf ?pathwayRes .
    		
  #?pathwayRes wp:ontologyTag cur:Reactome_Approved . 
  ?pathwayRes wp:ontologyTag cur:AnalysisCollection .   
  
  ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetaboliteSecond ;
    		dcterms:isPartOf ?pathwayRes .
}

ORDER BY DESC(?CHEBIsInPWsSecond) "

query_CombinePWs_Second <- paste(item1,string_CHEBI,item2, string_CHEBI_second, item3)
remove(item1, item2, item3)

results_CombinePWs_Second <- SPARQL(endpointwp,query_CombinePWs_Second,curl_args=list(useragent=R.version.string))
showresults_CombinePWs_Second <- results_CombinePWs_Second$results
remove(query_CombinePWs_Second,results_CombinePWs_Second)

#Print table with second match, first 5 relevant pathways
print("Second best matching pathways are:")
if(nrow(showresults_CombinePWs_Second) < 5){
print(showresults_CombinePWs_Second[1:nrow(showresults_CombinePWs_Second),c(2,3,4,5,7)])
}else{print(showresults_CombinePWs_Second[1:5,c(2,3,4,5,7)])}
```

Select 2nd most relevant pathways

```{r}
## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
second_selected_pathway = 1
```

Third, find third pathway covering most unique biomarkers:

```{r}
#First, obtain all the ChEBI IDs of the second highest ranked pathway, compare these to the list of markers remaining and only retain the marker which weren't covered yet
catch_chebis_secondPW <- showresults_CombinePWs_Second[second_selected_pathway,7]
split_chebis_secondPW <- as.list(strsplit(catch_chebis_secondPW, ' ')[[1]])
split_string_CHEBI_second <- as.list(strsplit(string_CHEBI_second, ' ')[[1]])
cleaned_string_CHEBI_second <- as.list(str_replace(split_string_CHEBI_second, "ch:", ""))
compare_SecondThird_CHEBI <- setdiff(cleaned_string_CHEBI_second, split_chebis_secondPW)
query_compatible_third <- as.list(paste("ch:", compare_SecondThird_CHEBI, sep=""))
string_CHEBI_third <- paste(query_compatible_third, collapse = ' ')
remove(catch_chebis_secondPW,split_chebis_secondPW,split_string_CHEBI_second,cleaned_string_CHEBI_second,compare_SecondThird_CHEBI,query_compatible_third)

#Removed protein output, query times out.
item1 = "PREFIX ch:<https://identifiers.org/chebi/CHEBI:>
PREFIX cur: <http://vocabularies.wikipathways.org/wp#Curation:>
select distinct ?pathwayRes (str(?wpid) as ?pathway) (str(?title) as ?pathwayTitle) (count(distinct ?chebiMetabolite) AS ?CHEBIsInPWs) (count(distinct ?chebiMetaboliteSecond) AS ?CHEBIsInPWsSecond) (count(distinct ?chebiMetaboliteThird) AS ?CHEBIsInPWsThird) (GROUP_CONCAT(DISTINCT fn:substring(?chebiMetabolite,37);separator=' ') AS ?includedCHEBIs)
where {
VALUES ?chebiMetabolite {"

item2 = "}
VALUES ?chebiMetaboliteSecond {"

item3 = "}
VALUES ?chebiMetaboliteThird {"

item4 = "}
 
 ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetabolite ;
    		dcterms:isPartOf ?pathwayRes .

 ?pathwayRes a wp:Pathway ;
             wp:organismName 'Homo sapiens' ; 
    		dcterms:identifier ?wpid ;
    		dc:title ?title .
    		
  #?pathwayRes wp:ontologyTag cur:Reactome_Approved . 
  ?pathwayRes wp:ontologyTag cur:AnalysisCollection .   
  
  ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetaboliteSecond ;
    		dcterms:isPartOf ?pathwayRes .
    		
  ?datanode	a wp:Metabolite ;          
           	wp:bdbChEBI ?chebiMetaboliteThird ;
    		dcterms:isPartOf ?pathwayRes .  		
}

ORDER BY DESC(?CHEBIsInPWsThird) "

query_CombinePWs_Third <- paste(item1,string_CHEBI,item2, string_CHEBI_second, item3, string_CHEBI_third, item4)
remove(item1, item2, item3, item4)

results_CombinePWs_Third<- SPARQL(endpointwp,query_CombinePWs_Third,curl_args=list(useragent=R.version.string))
showresults_CombinePWs_Third <- results_CombinePWs_Third$results
remove(query_CombinePWs_Third,results_CombinePWs_Third)

#Print table with third match, first 5 relevant pathways (if there are 5)
##TODO: skip lines below, if there is no relevant third best PW. (patient I for example)
print("Third best matching pathways are:")
if(nrow(showresults_CombinePWs_Third) < 1){
paste("No more pathway models found")
}else if(nrow(showresults_CombinePWs_Third) < 5){
print(showresults_CombinePWs_Third[1:nrow(showresults_CombinePWs_Third),c(2,3,4:7)])
  }else{print(showresults_CombinePWs_Third[1:5,c(2,3,4:7)])}

```

Select 3nd most relevant pathways

```{r}
if(nrow(showresults_CombinePWs_Third) < 1){
  third_selected_pathway = 0
paste("Skipping this step")}else{
third_selected_pathway = 1 ## Note for user: pick a different number here, if another pathway than the first is deemed most relevant (default is top pathway)
}
```

Fourth, find which biomarkers are not covered by top three (or two).

```{r}
#First, obtain all the ChEBI IDs of the third highest ranked pathway, compare these to the list of remaining markers and only retain the marker which weren't covered yet
if(third_selected_pathway < 1){
paste("Using IDs from step 2 iso step 3, relevant pathway models are:")
split_string_CHEBI_Third <- as.list(strsplit(string_CHEBI_third, ' ')[[1]])
 cleaned_string_CHEBI_Third <- as.list(str_replace(split_string_CHEBI_Third, "ch:", ""))
##Using IDs from third step, if no third PWM is selected
string_CHEBI_third_final <- paste(cleaned_string_CHEBI_Third, collapse = ', ')

#Find names for IDs which are present in a pathway model (if two pathways were selected):
CHEBI_missingNames_pathways_third <- glue::glue("CHEBI:{cleaned_string_CHEBI_Third}")
missingNames_pathways_third <- list()
for (j in 1:length(CHEBI_missingNames_pathways_third)){
  for (i in 1:nrow(annotated_both_nozero)){
    if(annotated_both_nozero[i,4] == CHEBI_missingNames_pathways_third[j]){
       missingNames_pathways_third[j] <- annotated_both_nozero[i,3]
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_pathways_third <- do.call(paste, c(as.list(missingNames_pathways_third), sep = ", "))

  }else{
catch_chebis_ThirdPW <- showresults_CombinePWs_Third[third_selected_pathway,7]
split_chebis_ThirdPW <- as.list(strsplit(catch_chebis_ThirdPW, ' ')[[1]])
split_string_CHEBI_Third <- as.list(strsplit(string_CHEBI_third, ' ')[[1]])
cleaned_string_CHEBI_Third <- as.list(str_replace(split_string_CHEBI_Third, "ch:", ""))
##Find IDs still missing after selecting top 3 pathway models.
compare_ThirdFourth_CHEBI <- setdiff(cleaned_string_CHEBI_Third, split_chebis_ThirdPW)
query_compatible_fourth <- as.list(compare_ThirdFourth_CHEBI, sep="")
string_CHEBI_fourth <- paste(query_compatible_fourth, collapse = ', ')

remove(catch_chebis_ThirdPW,split_string_CHEBI_Third,compare_ThirdFourth_CHEBI)

#Find names for IDs which are present in a pathway model (if three pathways were selected):
CHEBI_missingNames_pathways <- glue::glue("CHEBI:{query_compatible_fourth}")
missingNames_pathways <- list()
for (j in 1:length(CHEBI_missingNames_pathways)){
  for (i in 1:nrow(annotated_both_nozero)){
    if(annotated_both_nozero[i,4] == CHEBI_missingNames_pathways[j]){
       missingNames_pathways[j] <- annotated_both_nozero[i,3]
      }
    else{next}
  }
}
remove(i,j)
#Save list on one string for reporting purposes
string_missingNames_pathways <- do.call(paste, c(as.list(missingNames_pathways), sep = ", "))

}

#Print results selected pathways
print(paste0(showresults_CombinePWs[first_selected_pathway,3], " (", showresults_CombinePWs[first_selected_pathway,2], ") [",  showresults_CombinePWs[first_selected_pathway,4], "]"))
print(paste0(showresults_CombinePWs_Second[second_selected_pathway,3], " (", showresults_CombinePWs_Second[second_selected_pathway,2], ") [",  showresults_CombinePWs_Second[second_selected_pathway,4], "]"))

#Print results not selected biomarkers
if(third_selected_pathway == 0){
  print(paste0("These identifiers are not captured in the top 2 pathways: ", string_CHEBI_third_final, ", with the following database names: ", string_missingNames_pathways_third))
}else if(third_selected_pathway == 1){
print(paste0(showresults_CombinePWs_Third[third_selected_pathway,3], " (", showresults_CombinePWs_Third[third_selected_pathway,2], ") [",  showresults_CombinePWs_Third[third_selected_pathway,4], "]"))
  print(paste0("These identifiers are not captured in the top 3 pathways: ", string_CHEBI_fourth, ", with the following database names: ", string_missingNames_pathways))
} else{print(paste0(showresults_CombinePWs_Third[third_selected_pathway,3], " (", showresults_CombinePWs_Third[third_selected_pathway,2], ")# [",  showresults_CombinePWs_Third[third_selected_pathway,4], "]"))
  print(paste0("These identifiers are not captured in the top 3 pathways: ", string_CHEBI_fourth, ", with the following database names: ", string_missingNames_pathways))}

```

# 2.8 Data Visuzalization

## Preparation

### Installing and loading packages

These packages need to be installed (and may take a while to run).

```{r, echo = FALSE}
if(!"rWikiPathways" %in% installed.packages()){
    if (!requireNamespace("BiocManager", quietly = TRUE))
        install.packages("BiocManager")
    BiocManager::install("rWikiPathways", update = FALSE)
}
library(rWikiPathways)

load.libs <- c(
  "RColorBrewer",
  "rWikiPathways",
  "RCy3")
options(install.packages.check.source = "no")
options(install.packages.compile.from.source = "never")
if (!require("pacman")) install.packages("pacman"); library(pacman)
p_load(load.libs, update = TRUE, character.only = TRUE)
status <- sapply(load.libs,require,character.only = TRUE)
if(all(status)){
    print("SUCCESS: You have successfully installed and loaded all required libraries.")
} else{
    cat("ERROR: One or more libraries failed to install correctly. Check the following list for FALSE cases and try again...\n\n")
    status
}

```

### Connect to cytoscape

Open the Cytoscape Program first; this step was tested on version: 3.9.1, running on Java 11.0.11.

```{r}
# Make sure to have Cytoscape started on your computer (>=3.9.1)
cytoscapePing ()
cytoscapeVersionInfo ()
```

### Install WikiPathways apps

For the opening of pathway models and enhanced visualization steps, the WikiPathways app necessary (version 3.3.10).

```{r}
if("WikiPathways" %in% commandsHelp("")) print("Success: the WikiPathways app is installed") else print("Warning: WikiPathways app is not installed. Please install the WikiPathways app before proceeding.")
if(!"WikiPathways" %in% commandsHelp("")){
  installApp("WikiPathways")
}
```

### Open pathway from WikiPathways and Visualize patient data through log.Change

Pathway models can be imported from WikiPathways with the WikiPathways apps (as pathway or network). Enabling the visualization of biomarkers scattered over several pathways, requires finding the optimal combination of two (or three) individual pathways, adding the data to these pathways and merging the results.

Determine the scale for fill color, and convert data to dataframe

```{r}
#Determine highest up/downregulated value:
highest <- max(annotated_both$log.Change)
lowest <- abs(min(annotated_both$log.Change))
if(highest < lowest){colorRange <- lowest} else{colorRange <- highest}

#Change data to dataframe before loading in Cytoscape:
finalData <- as.data.frame(annotated_both)
```

The 2log transformed data can now be visualized on the nodes in the form of a color gradient from blue(downregulated) to white (0) to red (upregulated), just as the reference data.

```{r}
#Select highest relevant pathway ID (containing most biomarkers) based on previous step.
wp= commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs[first_selected_pathway,2])) #pick first PW to show data on.

## Linking metabolic data to ChEBI-ID column
loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")

#Set range of data values for visualisation:
data.values = c(-colorRange,-1,0,1,colorRange)
#display.brewer.all(length(data.values), colorblindFriendly=TRUE, type="div") # div,qual,seq,all
node.colors <- c(rev(brewer.pal(length(data.values), "RdBu")))

#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")

#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_First")
network1 <- exportImage(imageURL_Name,'PNG')
```

```{r}
#Show the network as output in RMD file
filenameFirst<- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameFirst)
```
### Selecting the second pathway of choice

```{r}
#Select secondly highest ranked relevant PW (manual for now):
wp = commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs_Second[second_selected_pathway,2]))

loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")

#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")

#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_Second")
network2 <- exportImage(imageURL_Name,'PNG')

```

```{r}
#Show the network as output in RMD file
filenameSecond <- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameSecond)
```

### Selecting the third pathway of choice

```{r}
if(third_selected_pathway < 1){
  paste("skip this step, no third pathway selected")}else{

#Select third highest ranked relevant PW (manual for now):
wp= commandsRun(paste0('wikipathways import-as-network id=', showresults_CombinePWs_Third[third_selected_pathway,2]))

loadTableData(finalData, data.key.column = "ID", table.key.column = "ChEBI")

#Update WikiPathways pallet with Patient Data:
setNodeColorMapping("log.Change", data.values, node.colors, default.color = "#AAAAAA", style.name = "WikiPathways-As-Network")

#Download the Network as a PNG Figure, and store the figure in the Images folder.
imageURL_Name <- paste0("Images/patient_",patientID,"_network_Third")
network3 <- exportImage(imageURL_Name,'PNG')

#Show the network as output in RMD file
filenameThird <- paste0(imageURL_Name, '.png')
knitr::include_graphics(filenameThird)

}

```
### Print missing biomarkers in visualization

```{r}
#TODO print which biomarkers are not visualized automatically (now manually selected); also add to final reporting step!
missing_IDs  <- paste0(annotated_both_nozero[5,4], ", " , annotated_both_nozero[10,4])
print(annotated_both_nozero[c(5,10),c(1,3,4,8)], digits = 2)
```

# 2.9 Data Interpretation

### Finally: Print reporting details

```{r}
###Print overview of information for each patient
print(paste("Selected Patient ID is: " , patientID, ", age is between: ", agerange, " old"))
remove(age)
print("Relevant biomarkers are:")
print(annotated_both_nozero[,c(1,3,4,8)], digits = 2)

#Print relevant biomarker information matched to pathways:
print(paste("There are", count_biomarkers ,"biomarkers relevant for patient" , patientID, ", the ChEBI-IDs are", string_CHEBI))
if(length(intersectingCHEBI) == 0 ){print("All Biomarkers are in a pathway!")} else{
  print(paste("These biomarkers (as ChEBI IDs) are not in a pathway:" , string_intersectingCHEBI))}

remove(listMissingBiomarkers, CHEBI_inPWs,supercleaned_CHEBI,string_intersectingCHEBI)

#Print table with first five relevant pathways
#print(showresults_CombinePWs[1:5,c(2,3,4,6)])

###Save relevant information for each patient in dataframe (for Tables in publication)
#table6 <- data.frame(matrix(ncol=4,nrow=0, dimnames=list(NULL, c("Primary.ID", "Secondary.ID", "Tertiary.ID", "Not.covered.biomarkers"))))
##Add patient data to dataframe (note: this overwrites existing patient entries, as opposed to using rbind)
#table6[patientID, ] <- c(showresults_CombinePWs[1,2], showresults_CombinePWs_Second[1,2], showresults_CombinePWs_Third[1,2], missing_IDs)

```
