Read in libraries and hummingbird data

#Load required libraries
library(reshape2)
require(ggplot2)
library(picante)
library(lme4)
library(ape)
library(doSNOW)
library(phytools)
library(maptools)
library(knitr)
library(raster)
library(gridExtra)
library(phytools)
library(stringr)
library(dplyr)
library(scales)

opts_chunk$set(warning=FALSE,message=FALSE)
opts_chunk$set(fig.path='figure/',fig.width=10,echo=TRUE)

#Set git path
gitpath<-"C:/Users/Ben/Documents/Pred_Obs/"
droppath<-"C:/Users/Ben/Dropbox/"

setwd(gitpath)

The analysis takes in:

Phylogenetic Tree in newick format
Sites matrix with long lat
Species Assemblages corresponding to the sites matrix

#Bring in Phylogenetic Data
trx<-read.tree(paste(gitpath,"InputData\\hum294.tre",sep=""))

#format tips
new<-str_extract(trx$tip.label,"(\\w+).(\\w+)")
#get duplicates
trx<-drop.tip(trx,trx$tip.label[duplicated(new)])

#name tips.
trx$tip.label<-str_extract(trx$tip.label,"(\\w+).(\\w+)")

ctrx<-cophenetic(trx)

#Bring in the assemblages, cleaned from JP
Sites<-read.csv(paste(gitpath,"InputData//Sites.csv",sep=""),row.names=1)

####bring in Clade data
clades<-read.csv(paste(gitpath,"InputData//CladeList.txt",sep=""),header=FALSE)[,-1]
colnames(clades)<-c("Clade","Genus","Species","double","English")
clades<-clades[,1:5]

#Change the syntax on clades so it matches output, replace . with space
clades$double.<-sub(" ",".",clades$double)

#Bring in spatial data for the sites
#Extract env information for each site
Sites.sp<-SpatialPointsDataFrame(Sites[,c("LongDecDeg","LatDecDeg")],Sites)

#site by species lists that was aggregate for the SDM dataset in SpeciesOverlap.RmD
siteXspp<-read.csv("C:/Users/Ben/Dropbox/Thesis/Pred_Realized/Assemblages//SiteXsppraster.csv",row.names=1)

##Elev raster
elev.ca<-raster("InputData/EcuadorRaster.grd")

#raster template for cell numbers
r<-raster("InputData/RasterTemplate.grd")

Following Pigot’s 2013 paper here, we can create reasonable expectation for the degree of sympatry versus time since speciation.

We will modify this to look at co-occurrence of sister pairs as a function of time since speciation.

Find sister pairs

plotTree(trx,fsize=.1)

sisters<-sapply(trx$tip.label,function(x){
  getSisters(trx,mode="label",node=x)})

names(sisters)<-trx$tip.label
sisters<-melt(sisters)
colnames(sisters)<-c("Sister","Node")

Time since divergence for sister pairs

sisters$Time<-NULL
for (x in 1:nrow(sisters)){
  row<-sisters[x,]
  distp<-ctrx[rownames(ctrx) %in% row[[1]],colnames(ctrx) %in% row[[2]]]
  if(length(distp)==0){
    sisters[x,"Time"]<-NA
    next}
  sisters[x,"Time"]<-distp
}

#remove duplicates
sisters<-sisters[!duplicated(sisters$Time),]

Co-occurrence Rates

How many times do sister pairs co-occur?

sisters$Co<-NA
for (x in 1:nrow(sisters)){
  row<-sisters[x,]
  pairs<-siteXspp[rownames(siteXspp) %in% c(row[[1]],row[[2]]),]
  
  #If we are missing a species from the list, skip
  if(nrow(pairs)==2) {
  co_occur<-sum(apply(pairs,2,sum)==2)
  sisters[x,"Co"]<-co_occur}
}

Co-occurrence and Time since Divergence

Probability of co-occurrence of sister pairs as a function of time since divergence, getting closer to the model proposed by Pigot.

sisters$PCo<-(sisters$Co>0) * 1
ggplot(sisters,aes(x=Time,y=PCo)) + geom_point() + theme_bw() + geom_smooth(method="glm",family="binomial") + labs(x="Time since divergence between sister taxa",y="Probability of co-occurrence")

This seems quite sensitive to the shape of the phylogeny. If we ignore just the furthest tip (co-occurrence among deep splitting topazes), what happens?

sisters$PCo<-(sisters$Co>0) * 1

In the Pigot 2013 Ecology Letters paper, the relationship is plotted as the proportion of species in sympatry. For this we need to bin time intervals, let’s round time since divergence to two decimal places.

#round to two decimal places
sisters$RTime<-round(sisters$Time)

For each time interval, what % of species are sympatric?

In the 2013 paper, Pigot combines time intervals with less than 4 species

#remove species for which we have no information
sis<-sisters[!is.na(sisters$PCo),]
#remove na for divergence time
sis<-sis[!is.na(sis$Time),]

#how many sister pairs do we have left?
nrow(sis)

## [1] 29

pigot<-sis %>% group_by(RTime) %>% summarize(n=n(),pairs=sum(PCo,na.rm=T)) %>% mutate(p=pairs/n)

Cut the sister lineages into class based on time since speciation

sis$RTime_cut<-cut(sis$RTime,c(seq(0,8,1),10,15,30),right=F)

pigot<-sis %>% group_by(RTime_cut) %>% summarize(n=n(),pairs=sum(PCo,na.rm=T)) %>% mutate(p=pairs/n)

ggplot(pigot,aes(x=RTime_cut,y=p,size=n)) + geom_point(alpha=.5) + geom_smooth(method="glm",aes(group=1),family="binomial",se=F) + ylab("% Pairs Sympatric") + xlab("Time Since Speciation") + theme_bw() + labs(size="Species Pairs")

Time since divergence and cost distance.

Geographic weighted cost distance. If dispersal limitation was an important factor, we would expect recently diverged taxa to be seperated by large geographic barriers compared to deeper splits.

#read in cost matrix between all sites from SpeciesOverlap.Rmd
cp<-read.table("CostMatrix.txt")

#make 0 (instead of infinite) cost from site to itself
cp<-as.matrix(cp)
diag(cp)<-0

rownames(cp)<-colnames(siteXspp)
colnames(cp)<-colnames(siteXspp)

View cost paths between sister taxa

For each species pair, get the cost distance between sites, with cost of co-occurrence being 0.

costsis<-list()
for (x in 1:nrow(sis)){
  A<-sis[x,"Sister"]
  B<-sis[x,"Node"]

  pairs<-siteXspp[rownames(siteXspp) %in% c(A,B),]
  
  #remove sites where neither occur
  pairs<-pairs[,apply(pairs,2,sum) > 0]
  
  #pres for each species
  presA<- colnames(pairs)[which(pairs[A,]==1)]
  presB<- colnames(pairs)[which(pairs[B,]==1)]  
  
  #Presence locations for species A and species B in cost distance matrix
  spdist<-cp[presA, presB,drop=F]
  
  #minimum distance
  #The easist way to preserve naming is to maintain the matrix structure, but set all non min values to NA
    for (g in 1:nrow(spdist)){
      min.col<-min(spdist[g,])
      #colnames to set to NA
      nacol<-as.numeric(which(spdist[g,] > min.col))
      spdist[g,nacol]<-NA
    }
  
 
  #create a dataframe
  df<-data.frame(To=A,From=B,Time=sis[x,"Time"],melt(spdist))
  
  #need to convert cells to characters
  df$Var1<-as.character(df$Var1)
  df$Var2<-as.character(df$Var2)
  
  costsis[[x]]<-df
}

#bind all dataframes together
costdf<-rbind_all(costsis)
colnames(costdf)<-c("To","From","Time","CellTo","CellFrom","CostDistance")

#remove the NAs
costdf<-costdf[!is.na(costdf$CostDistance),]

Plots

#turn to a numeric value
costdf$CellTo<-as.numeric(str_extract(costdf$CellTo,"\\d+"))
costdf$CellFrom<-as.numeric(str_extract(costdf$CellFrom,"\\d+"))

#remove infinites (islands or holes in DEM)
costdf<-costdf[is.finite(costdf$CostDistance),]

#label pairs
costdf$pairs<-as.factor(as.numeric(as.factor(paste(costdf$To,costdf$From))))

sampl<-costdf[costdf$pairs==1,]

makePlot<-function(sampl){
  
  paths<-foreach(x=1:nrow(sampl),.packages=c("raster","gdistance","stringr"),.errorhandling = 'pass',.export=c("elev.ca","r")) %dopar% {
    orig<-xyFromCell(r,cell=sampl$CellTo[x],spatial=T)
    dest<-xyFromCell(r,cell=sampl$CellFrom[x],spatial=T)
    
    if(sampl$CellTo[x]==sampl$CellFrom[x]){return(NA)}
    
    #What elevation is the origin
    elev_origin<-extract(elev.ca,orig)[[1]]
    if(is.na(elev_origin)){ elev_origin<-0}
    
    #Get the difference between the origin elevation and every cell in the raster
    elev_diff<-abs(elev_origin-elev.ca)
    
    #create a the transition matrix where permeability is high where elev difference is low
    trans_diff<-transition(elev_diff,function(x) 1/mean(x),8)
    
    #Diagonal Cell Correction, diagonals are father away than adjacent cells. 
    slope <- geoCorrection(trans_diff)
  
    #shortest path
    l<-shortestPath(slope,orig,dest,output="SpatialLines")
    return(l)
  }
  
  #plot raster and lines
  orig<-xyFromCell(r,cell=sampl$CellTo,spatial=T)
  dest<-xyFromCell(r,cell=sampl$CellFrom,spatial=T)
  
  op <- par(mar=rep(.01, 4))
  plot(elev.ca,axes=F,legend=F,box=F,ext=extent(rbind.SpatialPoints(orig,dest))*2.5)
  par(op)
  
  points(orig,col="red",cex=2.5,pch=20)
  points(dest,col="blue",cex=1.5,pch=20)
  
  #add a point legend
  l <- legend( "top"
         , inset = c(0.01,0.01) 
         , cex = .8
         , bty = "n"
         , legend = c(unique(sampl$To), unique(sampl$From))
         , pt.bg = c("red","blue")
         , pch = 21
)
  
  for(x in 1:length(paths)){
    lines(paths[[x]])
  }
  
  #write to file
  filnam<-paste("CostFigures/",str_replace_all(paste(unique(sampl$To),unique(sampl$From),sep="_"),"\\.+",""),".svg",sep="")
  
  svg(filnam)
  op <- par(mar=rep(.01, 4))
  plot(elev.ca,axes=F,legend=F,box=F,ext=extent(rbind.SpatialPoints(orig,dest))*2.5,col=grey.colors(100))
  par(op)
  
  points(orig,col="black",cex=2,pch=1)
  points(dest,col="black",cex=1,pch=17)
  
  #add a point legend
  l <- legend( "top"
         , inset = c(0,0) 
         , cex = .8
         , bty = "n"
         , legend = c(unique(sampl$To), unique(sampl$From))
         , col = c("black","black")
         , pch = c(1,17)
)
  for(x in 1:length(paths)){
    lines(paths[[x]],lwd=2)
  }
  
  invisible(out<-dev.off())
}

Run for each sister pair.

splitdf<-split(costdf,costdf$pairs)

  cl<-makeCluster(20,"SOCK")
  registerDoSNOW(cl)
  
lapply(splitdf,makePlot)


##   2

    stopCluster(cl)

Color each sister pair to see the spread.

ggplot(costdf,aes(x=Time,y=CostDistance,col=pairs)) + geom_point() + geom_smooth(method="glm",aes(group=1),family="poisson") + theme_bw() + xlab("Time since divergence between sister taxa") + scale_color_discrete(guide="none") + ylab("Environmentally Weighted Geographic Distance")

This is caused by the co-occurrence of topazes - plot both lines

#label pairs
ggplot(costdf,aes(x=Time,y=CostDistance,col=pairs)) + geom_point() + geom_smooth(method="glm",aes(group=1),family="poisson") + theme_bw() + xlab("Time since divergence between sister taxa (myr)") + scale_color_discrete(guide="none") + ylab("Environmentally Weighted Geographic Distance") + geom_smooth(data=costdf[costdf$Time<20,],aes(group=1),linetype="dashed",method="glm",family="poisson",col="black",size=1.1)

Is that negative relationship is meaningful - Fit a regression line with a random effect for species pair.

mod<-lmer(data=costdf,CostDistance~Time + (1|pairs))
summary(mod)

## Linear mixed model fit by REML ['lmerMod']
## Formula: CostDistance ~ Time + (1 | pairs)
##    Data: costdf
## 
## REML criterion at convergence: 2557.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.6890 -0.0105  0.1418  0.3487  1.9451 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  pairs    (Intercept)  7.994   2.827   
##  Residual             22.267   4.719   
## Number of obs: 423, groups:  pairs, 29
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  18.2233     0.9231  19.742
## Time         -0.3708     0.1048  -3.539
## 
## Correlation of Fixed Effects:
##      (Intr)
## Time -0.738

#assuming normal approximation
# extract coefficients
coefs <- data.frame(coef(summary(mod)))
# use normal distribution to approximate p-value
coefs$p.z <- 2 * (1 - pnorm(abs(coefs$t.value)))
coefs

##               Estimate Std..Error   t.value          p.z
## (Intercept) 18.2232907  0.9230687 19.742075 0.0000000000
## Time        -0.3708189  0.1047933 -3.538573 0.0004022957

Time since divergence between taxa is weakly useful predictor for the cost distance between species sites. If dispersal limitation through a time lag was a important factor shaping co-occurrence, we would expect to see the weight of the sympatry to be stronger, because it could be the spread of points.

How would the relationship look just with the mean cost distance between sister pairs?

meanC<-costdf %>% group_by(pairs,Time) %>% summarise(cp=mean(CostDistance))

#label pairs
p<-ggplot(meanC,aes(x=Time,y=cp)) + geom_point(size=4,alpha=.8,shape=1) 

#highlight two sample pairs
phaethornis<-as.numeric(unique(costdf[costdf$To %in% c("Phaethornis.guy",'Phaethornis.yaruqui'),"pairs"]))

lesbia<-as.numeric(unique(costdf[costdf$To %in% c("Lesbia.nuna",'Lesbia.victoriae'),"pairs"]))

#label callouts
p<-p + geom_point(data=meanC[meanC$pairs %in% phaethornis,],col="black",size=4,alpha=.8)

p<-p + geom_point(data=meanC[meanC$pairs %in% lesbia,],col="black",size=4,alpha=.8)

#add theme
p + scale_shape_discrete(solid=F)+ geom_smooth(method="lm",aes(group=1),col="black") + theme_bw() + xlab("Time since divergence between sister taxa (myr)") + scale_color_discrete(guide="none") + ylab("Average cost distance between localities") + geom_smooth(data=meanC[meanC$Time<10,],method="lm",aes(group=1),linetype="dashed",col="black")

ggsave("CostDivergence.svg",height=5,width=7)
ggsave("CostDivergence.jpeg",height=5,width=7,dpi=600)

Is the relationship significant for mean distance?

mod<-lm(data=meanC,cp~Time)
summary(mod)

## 
## Call:
## lm(formula = cp ~ Time, data = meanC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.1104 -0.7725  0.7806  1.4929  3.9444 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.0932     0.8323  21.738  < 2e-16 ***
## Time         -0.3655     0.1003  -3.646  0.00112 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.117 on 27 degrees of freedom
## Multiple R-squared:  0.3299, Adjusted R-squared:  0.3051 
## F-statistic: 13.29 on 1 and 27 DF,  p-value: 0.001121

#assuming normal approximation
# extract coefficients
coefs <- data.frame(coef(summary(mod)))
# use normal distribution to approximate p-value
coefs$p.z <- 2 * (1 - pnorm(abs(coefs$t.value)))
coefs

##              Estimate Std..Error   t.value     Pr...t..          p.z
## (Intercept) 18.093215  0.8323403 21.737762 1.219770e-18 0.0000000000
## Time        -0.365505  0.1002579 -3.645647 1.121076e-03 0.0002667196

Ignoring the deepest lineages.

mod<-lm(data=meanC[meanC$Time < 10,],cp~Time)
summary(mod)

## 
## Call:
## lm(formula = cp ~ Time, data = meanC[meanC$Time < 10, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4850  0.0716  0.9848  1.7603  3.1346 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  17.0112     1.0967  15.511 5.22e-14 ***
## Time         -0.1038     0.2089  -0.497    0.624    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.063 on 24 degrees of freedom
## Multiple R-squared:  0.01019,    Adjusted R-squared:  -0.03105 
## F-statistic: 0.247 on 1 and 24 DF,  p-value: 0.6237

#assuming normal approximation
# extract coefficients
coefs <- data.frame(coef(summary(mod)))
# use normal distribution to approximate p-value
coefs$p.z <- 2 * (1 - pnorm(abs(coefs$t.value)))
coefs

##               Estimate Std..Error    t.value     Pr...t..       p.z
## (Intercept) 17.0112292  1.0966989 15.5113025 5.221003e-14 0.0000000
## Time        -0.1038227  0.2088887 -0.4970238 6.236948e-01 0.6191722

Time to Sympatry

Ben Weinstein

August 17, 2015