## Solution to Tasks for L08
## NOTE: the workspace is intentionally not cleared here (no rm(list = ls())).
## Every object used below is assigned before it is read, so wiping the
## caller's global environment is unnecessary; restart R for a clean session.
##==========================================================
## Task 1
## A biology student wishes to determine the relationship between temperature and heart rate in the leopard frog,
## Rana pipiens. He manipulates the temperature in 2 °C increments ranging from 2 to 18 °C and records the heart
## rate at each interval. His data are presented in the table rana.
## The first column of the file is used as row names (row.names = 1);
## comment.char = "" switches off comment parsing while reading the file.
Tab <- read.table("http://edu.modas.lu/data/txt/rana.txt",
                  sep = "\t", row.names = 1, header = TRUE, comment.char = "")
str(Tab)
plot(Tab, pch = 19)
## 1) Build the model and provide the p-value for linear dependency
model <- lm(Heart.rate ~ Temperature, data = Tab)
summary(model)
anova(model)
## 2) Provide interval estimation for the slope of the dependency
## (confint() reports the default 95% intervals for the intercept and the slope)
confint(model)
## 3) Estimate 95% prediction interval for heart rate at 15 °C
## The argument name and value are spelled out instead of relying on partial matching.
predict(model, newdata = data.frame(Temperature = 15), interval = "prediction")
##==========================================================
## Task 2
## The height and arm span of 10 adult males were measured (span).
## Is there a correlation between these two measurements? Carry out an appropriate analysis.
## Load the height / arm-span table (tab-separated file with a header row)
Tab <- read.table(
  "http://edu.modas.lu/data/txt/span.txt",
  header = TRUE,
  sep = "\t"
)
str(Tab)
plot(Tab, pch = 19)
## 1) Correlation between height and arm span, with its confidence interval
cor.test(Tab$Height, Tab$Span)
## 2) Linear regression of height on arm span
model <- lm(Height ~ Span, data = Tab)
summary(model)
##==========================================================
## Task 3
## Data are shown in the Table leukemia for two groups of patients who died of acute myelogenous leukemia.
## Patients were classified into the two groups according to the presence or absence of a morphologic
## characteristic of white cells. Patients termed AG positive were identified by the presence of
## Auer rods and/or significant granulature of the leukemic cells in the bone marrow at diagnosis.
## For AG-negative patients, these factors were absent. Leukemia is a cancer characterized by an
## overproliferation of white blood cells; the higher the white blood count (WBC), the more severe the
## disease. Separately for each morphologic group, AG positive and AG negative:
Tab <- read.table("http://edu.modas.lu/data/txt/leukemia.txt", sep = "\t", header = TRUE)
str(Tab)
## 1. Draw a scatter diagram with a regression line to show a possible association between the log survival time
## (take the log yourself and use as the dependent variable) and the log WBC (take the log yourself).
Tab$logWBC <- log10(Tab$WBC)
Tab$logSurvival <- log10(Tab$Survival)
## Per-group subsets, reused for both the models and the plots
TabPos <- Tab[Tab$AG == "Positive", ]
TabNeg <- Tab[Tab$AG == "Negative", ]
## The models are fitted before plotting so that each scatter diagram can
## carry its own regression line, as the task requires.
modelAll <- lm(logSurvival ~ logWBC, data = Tab)
modelPos <- lm(logSurvival ~ logWBC, data = TabPos)
modelNeg <- lm(logSurvival ~ logWBC, data = TabNeg)
## Switch to a 2 x 2 panel layout and remember the previous setting so it can
## be restored; otherwise later plots would be drawn into the leftover panel.
oldPar <- par(mfcol = c(2, 2))
plot(Tab$logWBC, Tab$logSurvival, pch = 19, main = "All",
     xlab = "log10(WBC)", ylab = "log10(Survival)")
abline(modelAll, col = 2, lwd = 2)
plot(TabPos$logWBC, TabPos$logSurvival, pch = 19, main = "Positive",
     xlab = "log10(WBC)", ylab = "log10(Survival)")
abline(modelPos, col = 2, lwd = 2)
plot(TabNeg$logWBC, TabNeg$logSurvival, pch = 19, main = "Negative",
     xlab = "log10(WBC)", ylab = "log10(Survival)")
abline(modelNeg, col = 2, lwd = 2)
par(oldPar)
## 2. Build linear regression and check if a linear model is justified.
## Check the coefficient of determination and provide your interpretation.
## Is there the same effect of WBC for 2 groups?
summary(modelAll)
summary(modelPos)
summary(modelNeg)
## 3. What is the survival time for a patient with 20,000 WBC? Are estimates for different groups different or the same?
## AG-positive case: predict on the log10 scale, then back-transform with 10^
10^predict(modelPos, newdata = data.frame(logWBC = log10(20000)), interval = "confidence")
## AG-negative case: the group mean is used instead of the regression
## NOTE(review): presumably because the AG-negative regression is not
## significant - verify against summary(modelNeg). This is an arithmetic mean
## on the raw scale, whereas the AG-positive estimate is back-transformed from
## the log scale; 10^mean(logSurvival) would be the log-scale analogue - confirm
## which is intended.
mean(Tab$Survival[Tab$AG == "Negative"])
##==========================================================
## Task 4
## Dataset actingenes contains log-ratios of expressions of actin-related genes. A log-ratio is
## the log of the ratio of a gene's expression in a sample to its expression in universal human RNA solution.
## Samples come from cancer and healthy tissues of several organs (brain, breast, colon, liver, ovary, and uterus).
Tab <- read.table("http://edu.modas.lu/data/txt/actingenes.txt", sep = "\t", header = TRUE)
str(Tab)
## 1. Build a linear regression model for expressions (here it means - log ratios) of two genes:
## transcription factor 3 (TCF3) and myosin IX A (MYO9A). Is the relation between these genes
## statistically significant? Provide an equation, describing the model and a numerical proof of your conclusion.
model <- lm(MYO9A ~ TCF3, data = Tab)
summary(model)
## plot
## Colours are looked up by state NAME. as.character() matters when State is a
## factor (e.g. read with stringsAsFactors = TRUE): indexing a named vector with
## a factor uses its integer codes, which would swap the two colours.
stateCol <- c(healthy = "blue", cancer = "red")
plot(Tab$TCF3, Tab$MYO9A, pch = 19, col = stateCol[as.character(Tab$State)])
# add the regression line and its confidence interval (95%)
# NOTE(review): the grid is fixed to [-5, 1]; assumes the TCF3 log-ratios fall in this range - confirm
newdata <- data.frame(TCF3 = seq(-5, 1, 0.1))
# Predict once; the result has columns fit (line), lwr and upr (interval bounds)
band <- predict(model, newdata, interval = "confidence")
lines(newdata$TCF3, band[, "fit"], col = 1, lwd = 2)
lines(newdata$TCF3, band[, "lwr"], col = 1, lty = 2)
lines(newdata$TCF3, band[, "upr"], col = 1, lty = 2)
## 2. Now repeat the analysis for normal and cancer tissues. Draw the conclusions.
modelCancer <- lm(MYO9A ~ TCF3, data = Tab[Tab$State == "cancer", ])
summary(modelCancer)
modelHealthy <- lm(MYO9A ~ TCF3, data = Tab[Tab$State == "healthy", ])
summary(modelHealthy)