ANALYSE DES BENCHMARKS FIO SUR GRID5000

library(ggplot2)
library(plyr)
# Raw fio results, one semicolon-separated file per site (no header row).
grenoble <- read.csv("./grenoble.csv", header = FALSE, sep = ";",
                     stringsAsFactors = FALSE)
luxembourg <- read.csv("./luxembourg.csv", header = FALSE, sep = ";",
                       stringsAsFactors = FALSE)
nancy <- read.csv("./nancy.csv", header = FALSE, sep = ";",
                  stringsAsFactors = FALSE)

# Stack the three sites: grenoble rows first, then nancy, then luxembourg.
all <- rbind(rbind(grenoble, nancy), luxembourg)

Chargement des données de l’infrastructure Grid5000

# Per-host disk description: read the headerless file, then label its columns.
infra <- setNames(
  read.csv("infra.csv", header = FALSE, sep = ";", stringsAsFactors = FALSE),
  c("Hostname", "Model", "Size")
)

Fonction pour ajouter les noms de colonnes, enlever les erreurs, fusionner avec les données « infra », formater les unités et ne conserver que les opérations effectuées en E/S directes sur disque (direct_io=1)

#' Clean and enrich raw fio benchmark results.
#'
#' Names the 18 raw columns, keeps only the rows whose `Error` is "0" and whose
#' `DirectIO` is "1", joins the host description and derives the columns used
#' by the plotting functions.
#'
#' @param df Raw results with 18 columns, in the order given by the `names()`
#'   assignment below. `Hostname` must be a character column.
#' @param infra Host description; must contain `Hostname` and `Model`.
#' @return The filtered and merged data frame, with `Hostname` shortened to the
#'   part before the first dot and the extra columns `HostModel`, `Duration`
#'   (Runtime / 1000), `Size` (FileSize / 1024 / 1024) and `Bwi`
#'   (Duration / Size).
clean_up <- function(df, infra) {
  names(df) <- c(
    "Hostname", "Date", "DirectIO", "IOengine", "IOscheduler", "Error",
    "Operation", "Jobs", "BufferSize", "FileSize", "Runtime", "Bandwidth",
    "BandwidthMin", "BandwidthMax", "Latency", "LatencyMin", "LatencyMax",
    "IOPS"
  )
  df <- subset(df, Error == "0")
  df <- subset(df, DirectIO == "1")

  # Join on the full host name; it is only shortened after the merge.
  df <- merge(df, infra, by = "Hostname")

  # vapply() guarantees a character vector, including when no row survived the
  # filters (sapply() would hand back an empty list in that case).
  df$Hostname <- vapply(strsplit(df$Hostname, "[.]"), "[", character(1), 1)
  df$HostModel <- paste(df$Hostname, df$Model, sep = " - ")

  df$Duration <- df$Runtime / 1000 # fio outputs runtime in msec, we want to display seconds
  # NOTE: this replaces any `Size` column that the merge brought in from `infra`.
  df$Size <- df$FileSize / 1024 / 1024
  df$Bwi <- df$Duration / df$Size
  df
}
# Normalise the stacked results and attach the disk model of every host.
all <- clean_up(all, infra)

# Single-job runs, split by disk model.
sata <- subset(all, Model == "SATA" & Jobs == "1")
sata2 <- subset(all, Model == "SATA II" & Jobs == "1")
sas <- subset(all, Model == "SAS" & Jobs == "1")

# One cluster per disk model, selected by host name prefix.
edel <- subset(sata, grepl("^edel", Hostname))
griffon <- subset(sata2, grepl("^griffon", Hostname))
granduc <- subset(sas, grepl("^granduc", Hostname))

Checking the duration

#' Scatter plot of run time against file size, one panel per I/O scheduler.
#'
#' @param df Data frame with columns `Size`, `Duration`, `Operation` and
#'   `IOscheduler`.
#' @param cluster Label appended to the plot title.
#' @return A ggplot object.
plot_duration <- function(df, cluster) {
  plot_title <- paste("Runtime vs File size for ", cluster)
  canvas <- ggplot(df, aes(x = Size, y = Duration, color = Operation))

  canvas +
    theme_bw() +
    geom_point(alpha = 1 / 10) +
    facet_wrap(~IOscheduler) +
    ylab("Runtime (sec.)") +
    xlab("FileSize (Mo)") +
    ggtitle(plot_title)
}
# Run time on the griffon nodes.
plot_duration(df = griffon, cluster = "GRIFFON (SATA II)")

plot of chunk unnamed-chunk-7

# Run time on the edel nodes.
plot_duration(df = edel, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-7

# Run time on the granduc nodes.
plot_duration(df = granduc, cluster = "GRANDUC (SAS)")

plot of chunk unnamed-chunk-7

Checking the latency

#' Scatter plot of latency against file size, one panel per I/O scheduler.
#'
#' @param df Data frame with columns `Size`, `Latency`, `Operation` and
#'   `IOscheduler`.
#' @param cluster Label appended to the plot title.
#' @return A ggplot object; callers may add further layers or limits to it.
plot_latency <- function(df, cluster) {
  latency_aes <- aes(x = Size, y = Latency, color = Operation)
  heading <- paste("Latency vs File size for ", cluster)

  ggplot(df, latency_aes) +
    theme_bw() +
    geom_point(alpha = 1 / 10) +
    facet_wrap(~IOscheduler) +
    ylab("Latency (µsec.)") +
    xlab("File size (Mo)") +
    ggtitle(heading)
}
# Latency on the griffon nodes, with the y axis limited to [0, 1000].
plot_latency(df = griffon, cluster = "GRIFFON (SATA II)") +
  ylim(0, 1000)
## Warning: Removed 250 rows containing missing values (geom_point).
## Warning: Removed 218 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-9

# Latency on the edel nodes.
plot_latency(df = edel, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-9

# Latency on the granduc nodes.
plot_latency(df = granduc, cluster = "GRANDUC (SAS)")

plot of chunk unnamed-chunk-9

Checking the bandwidth

#' Scatter plot of `Bwi` against file size with a linear fit per operation.
#'
#' One panel is drawn per I/O scheduler. The y axis is limited with `ylim()`,
#' which drops the rows outside the range before the linear fit is computed
#' (ggplot2 reports them as "Removed ... rows" warnings).
#'
#' @param df Data frame with columns `Size`, `Bwi`, `Operation` and
#'   `IOscheduler`.
#' @param cluster Label appended to the plot title.
#' @param y_max Upper limit of the y axis. The default, 0.1, is the value that
#'   used to be hard-coded; raise it for clusters whose `Bwi` values are
#'   larger, otherwise most of their points are discarded.
#' @return A ggplot object.
plot_bandwidth <- function(df, cluster, y_max = 0.1) {
  ggplot(data = df, aes(x = Size, y = Bwi, color = Operation)) +
    geom_point(alpha = .5) +
    facet_wrap(~IOscheduler) +
    theme_bw() +
    geom_smooth(method = "lm") +
    ggtitle(paste("Bandwidth (Duration/Size) for ", cluster)) +
    ylim(0, y_max)
}
# Duration/Size ratio on the griffon nodes.
plot_bandwidth(df = griffon, cluster = "GRIFFON (SATA II)")
## Warning: Removed 4 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-11

# Duration/Size ratio on the edel nodes.
plot_bandwidth(df = edel, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-11

# Duration/Size ratio on the granduc nodes.
plot_bandwidth(df = granduc, cluster = "GRANDUC (SAS)")
## Warning: Removed 879 rows containing missing values (stat_smooth).
## Warning: Removed 873 rows containing missing values (stat_smooth).
## Warning: Removed 879 rows containing missing values (geom_point).
## Warning: Removed 873 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-11

Checking the linearity

Bandwidth linearity

#' Check how linearly `Bwi` varies with file size.
#'
#' Draws the points and, per operation, a linear fit without confidence band,
#' one panel per I/O scheduler.
#'
#' @param df Data frame with columns `Size`, `Bwi`, `Operation` and
#'   `IOscheduler`.
#' @param cluster Label appended to the plot title.
#' @return A ggplot object; callers may add y limits to it.
plot_bandwidth_linearity <- function(df, cluster) {
  ggplot(df, aes(x = Size, y = Bwi, color = Operation)) +
    theme_bw() +
    geom_point(alpha = .2) +
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
    geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
    facet_wrap(~IOscheduler) +
    ylab("Bandwidth") +
    xlab("File Size (MiB)") +
    ggtitle(paste("Checking the linearity of bandwidth for ", cluster))
}
# Linearity check on the griffon nodes, y axis limited to [0, 0.03].
plot_bandwidth_linearity(df = griffon, cluster = "GRIFFON (SATA II)") +
  ylim(0, 0.03)
## Warning: Removed 324 rows containing missing values (stat_smooth).
## Warning: Removed 2 rows containing missing values (stat_smooth).
## Warning: Removed 310 rows containing missing values (stat_smooth).
## Warning: Removed 324 rows containing missing values (geom_point).
## Warning: Removed 312 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-13

# Linearity check on the edel nodes, y axis limited to [0, 0.01].
plot_bandwidth_linearity(df = edel, cluster = "EDEL (SATA)") +
  ylim(0, 0.01)
## Warning: Removed 243 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 250 rows containing missing values (stat_smooth).
## Warning: Removed 243 rows containing missing values (geom_point).
## Warning: Removed 251 rows containing missing values (geom_point).

plot of chunk unnamed-chunk-13

# Linearity check on the granduc nodes, y axis limited to [0, 0.7].
plot_bandwidth_linearity(df = granduc, cluster = "GRANDUC (SAS)") +
  ylim(0, 0.7)

plot of chunk unnamed-chunk-13

IOPS linearity

#' Check how linearly IOPS varies with file size.
#'
#' Draws the points and, per operation, a linear fit without confidence band,
#' one panel per I/O scheduler.
#'
#' @param df Data frame with columns `Size`, `IOPS`, `Operation` and
#'   `IOscheduler`.
#' @param cluster Label appended to the plot title.
#' @return A ggplot object.
plot_linearity_iops <- function(df, cluster) {
  ggplot(df, aes(x = Size, y = IOPS, colour = Operation)) +
    theme_bw() +
    geom_point(alpha = .2) +
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
    geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
    facet_wrap(~IOscheduler) +
    ylab("IOPS") +
    xlab("File Size (in MiB)") +
    ggtitle(paste("Checking the linearity of IOPS for ", cluster))
}
# IOPS linearity on the griffon nodes.
plot_linearity_iops(df = griffon, cluster = "GRIFFON (SATA II)")

plot of chunk unnamed-chunk-15

# IOPS linearity on the edel nodes.
plot_linearity_iops(df = edel, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-15

# IOPS linearity on the granduc nodes.
plot_linearity_iops(df = granduc, cluster = "GRANDUC (SAS)")

plot of chunk unnamed-chunk-15

Bandwidth Variability

#' Histograms of `Bwi` for read and for write operations.
#'
#' Draws two base-graphics density histograms (read first, then write), each
#' overlaid with the normal density that has the sample's mean and standard
#' deviation.
#'
#' @param df Data frame with columns `Operation` and `Bwi`.
#' @param cluster Label appended to both plot titles.
#' @return Invisibly, the value of the last `curve()` call.
plot_variability <- function(df, cluster) {
  # Draw one histogram of `values` and overlay the matching normal curve.
  draw_panel <- function(values, title_prefix) {
    hist(values,
         freq = FALSE, xlab = "Bandwidth",
         breaks = 20,
         main = paste(title_prefix, cluster),
         col = "lightgreen")
    curve(dnorm(x, mean = mean(values), sd = sd(values)),
          add = TRUE, col = "darkblue", lwd = 2)
  }

  read_bwi <- subset(df, Operation == "read")$Bwi
  write_bwi <- subset(df, Operation == "write")$Bwi

  draw_panel(read_bwi, "Variability of Bwi for read operations for")
  draw_panel(write_bwi, "Variability of Bwi for write operations for")
}
# Read/write variability on the griffon nodes.
plot_variability(df = griffon, cluster = "GRIFFON (SATA II)")

plot of chunk unnamed-chunk-17plot of chunk unnamed-chunk-17

# Read/write variability on the edel nodes.
plot_variability(df = edel, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-17plot of chunk unnamed-chunk-17

# Read/write variability on the granduc nodes.
plot_variability(df = granduc, cluster = "GRANDUC (SAS)")

plot of chunk unnamed-chunk-17plot of chunk unnamed-chunk-17

Concurrent operations

# Runs with more than one job, split by disk model.
sata_c <- subset(all, Model == "SATA" & Jobs > 1)
sata2_c <- subset(all, Model == "SATA II" & Jobs > 1)
sas_c <- subset(all, Model == "SAS" & Jobs > 1)

# Same clusters as for the single-job analysis, selected by host name prefix.
edel_c <- subset(sata_c, grepl("^edel", Hostname))
griffon_c <- subset(sata2_c, grepl("^griffon", Hostname))
granduc_c <- subset(sas_c, grepl("^granduc", Hostname))

Bandwidth evolution

#' Plot `Bwi` against the number of jobs for concurrent runs.
#'
#' Jobs is treated as a discrete axis; points, lines and box plots are drawn
#' per operation, with one panel per (Size, IOscheduler) combination.
#'
#' @param df Data frame with columns `Jobs`, `Bwi`, `Operation`, `Size` and
#'   `IOscheduler`.
#' @param title Plot title.
#' @return A ggplot object.
plot_crw <- function(df, title) {
  jobs_aes <- aes(x = factor(Jobs), y = Bwi, color = Operation)

  ggplot(df, jobs_aes) +
    theme_bw() +
    geom_point(alpha = 1 / 10) +
    geom_line() +
    facet_wrap(Size ~ IOscheduler) +
    ylab("Bandwidth") +
    xlab("Jobs") +
    ggtitle(title) +
    geom_boxplot()
}
# Concurrent runs on all SATA hosts.
plot_crw(df = sata_c, title = "SATA CONCURRENT OPERATIONS")

plot of chunk unnamed-chunk-20

# Concurrent runs on all SATA II hosts.
plot_crw(df = sata2_c, title = "SATA II CONCURRENT OPERATIONS")

plot of chunk unnamed-chunk-20

# Concurrent runs on all SAS hosts.
plot_crw(df = sas_c, title = "SAS CONCURRENT OPERATIONS")

plot of chunk unnamed-chunk-20

Checking the influence of the number of concurrent operations on aggregate bandwidth

#' Plot aggregate bandwidth against the number of concurrent jobs.
#'
#' Adds two columns to a local copy of `df`: `TotalSize` (FileSize * Jobs) and
#' `BW`, the aggregate bandwidth in MiB/s, then plots `BW` against `Jobs` with
#' a linear fit per operation, one panel per I/O scheduler.
#'
#' @param df Data frame with columns `FileSize`, `Jobs`, `Runtime`,
#'   `Operation` and `IOscheduler`. `FileSize` is taken to be in bytes and
#'   `Runtime` in milliseconds, matching the conversions applied elsewhere in
#'   this analysis (FileSize / 1024 / 1024 for MiB, Runtime / 1000 for
#'   seconds).
#' @param cluster Label appended to the plot title.
#' @return A ggplot object.
plot_aggregate <- function(df, cluster) {
  df$TotalSize <- df$FileSize * df$Jobs
  # Convert bytes per millisecond to MiB per second so that the values match
  # the "MiB/s" axis label.
  df$BW <- (df$TotalSize / 1024 / 1024) / (df$Runtime / 1000)
  ggplot(data = df, aes(x = Jobs, y = BW, color = Operation)) +
    theme_bw() +
    geom_point(alpha = .5) +
    facet_wrap(~IOscheduler) +
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned.
    geom_smooth(method = "lm", se = FALSE, fullrange = TRUE) +
    xlab("Number of concurrent operations") +
    ylab("Aggregated Bandwidth (MiB/s)") +
    ggtitle(paste("Influence of the number of concurrent operations on aggregate bandwidth \n on ", cluster))
}
# Aggregate bandwidth for concurrent runs on the griffon nodes.
plot_aggregate(df = griffon_c, cluster = "GRIFFON (SATA II)")

plot of chunk unnamed-chunk-22

# Aggregate bandwidth for concurrent runs on the edel nodes.
plot_aggregate(df = edel_c, cluster = "EDEL (SATA)")

plot of chunk unnamed-chunk-22

# Aggregate bandwidth for concurrent runs on the granduc nodes.
plot_aggregate(df = granduc_c, cluster = "GRANDUC (SAS)")

plot of chunk unnamed-chunk-22