# The dataset consists of 100 observations on 14 separate variables and is
# an example of a segmentation study for a business-to-business situation,
# specifically a survey of existing customers of HATCO. Three types of
# information were collected:

#  First is the perception of HATCO on seven attributes identified
# in past studies as the most influential in the
# choice of suppliers. The respondents, purchasing managers of firms
# buying from HATCO, rated HATCO on each attribute.

# Second are actual purchase outcomes, either the evaluations of each
# respondent's satisfaction with HATCO or the percentage of his or her
# product purchases from HATCO. 

# Third are general characteristics of the
#  purchasing companies (e.g., firm size, industry type)

###################################
# Read the HATCO survey file with read.csv2() and discard its first and last
# columns. drop = FALSE guarantees the result is still a data frame even if
# only one column is left after the removal.
dados <- read.csv2("dados/HATCO.csv")
dados <- dados[, -c(1, ncol(dados)), drop = FALSE]
# Logical mask with one entry per column: TRUE where the column is numeric.
# vapply() always yields a logical vector, whereas sapply() returns an empty
# list when the data frame has no columns.
numericas <- vapply(dados, is.numeric, logical(1))

# Quick look at the first rows.
head(dados)

# Turn the categorical variables into dummy (indicator) columns.
# drop = FALSE keeps a data frame even when there is a single categorical
# column; otherwise the subset collapses to a vector and would be iterated
# element by element.
categoricas <- dados[, !numericas, drop = FALSE]
# lapply() always returns a list with one indicator matrix per variable.
# sapply() would collapse everything into a single matrix whenever all
# variables have the same number of levels, breaking the [[ ]] access below.
dadosDummies <- lapply(names(categoricas), function(nome) {
  niveis <- factor(categoricas[[nome]])
  dummies <- model.matrix(~ niveis - 1)
  # One column per level, in level order; prefix the level with the variable
  # name so columns coming from different variables can be told apart.
  colnames(dummies) <- paste(nome, levels(niveis), sep = "_")
  dummies
})
names(dadosDummies) <- names(categoricas)
# Bind all indicator blocks in one call instead of growing a matrix inside a
# loop; with no categorical variable an empty (zero-column) matrix is used.
matrizDummies <- if (length(dadosDummies) > 0) {
  do.call(cbind, dadosDummies)
} else {
  matrix(NA, nrow(dados), 0)
}
# Numeric variables followed by the dummies, then every column standardised
# with scale() (centred and divided by its standard deviation).
dadosExt <- cbind(dados[, numericas, drop = FALSE], matrizDummies)
dadosExt <- scale(dadosExt)

# Pairwise distances between the observations of the standardised matrix.
# dist() is called with its default metric, i.e. the Euclidean distance.
dados.dist <- dist(dadosExt)


#### Hierarchical clustering
# Build the tree with hclust() defaults and draw the dendrogram with the
# axis titles and the subtitle blanked out.
cluster <- hclust(dados.dist)
plot(cluster, xlab = "", sub = "", ylab = "")

# Cut the tree into three groups; `clusters` holds one label per observation.
clusters <- cutree(cluster, k = 3)

# Descriptive plots for the numeric variables: one boxplot per variable,
# split by hierarchical cluster and titled with the variable name.
# lapply() over the column names replaces apply() on an index matrix: nothing
# is drawn when there is no numeric column (apply() still calls the function
# once on a dummy value for zero-extent input), and invisible() keeps the
# boxplot statistics from being printed at the top level.
invisible(lapply(names(dados)[numericas], function(nome) {
  # ylab is set explicitly so the axis shows the variable name rather than
  # the subsetting expression.
  boxplot(dados[[nome]] ~ clusters, main = nome, ylab = nome)
}))


# Fill colours for the levels of a categorical variable, in level order.
cores <- c("darkblue", "darkred", "darkgreen")
# Descriptive plots for the categorical variables: one stacked barplot per
# variable with the level counts inside each hierarchical cluster.
# lapply() over the column names replaces apply() on an index matrix: nothing
# is drawn when there is no categorical column, and invisible() keeps the
# values returned by barplot() from being printed at the top level.
invisible(lapply(names(dados)[!numericas], function(nome) {
  # Rows are the levels of the variable, columns are the clusters.
  counts <- table(dados[[nome]], clusters)
  barplot(
    counts,
    legend.text = rownames(counts),  # full argument name, no partial match
    main = nome,
    bty = "l",
    # Recycle the palette so a variable with more levels than colours does
    # not end up with NA (unfilled) bars.
    col = rep_len(cores, nrow(counts))
  )
}))


# K-means with K = 3. The seed is fixed before the call so that the random
# initialisation, and therefore the resulting partition, is reproducible.
set.seed(302)
clustersK <- kmeans(x = dadosExt, centers = 3)
# Cross-tabulate the k-means labels (rows) against the hierarchical labels
# (columns) to compare the two partitions.
table(clustersK$cluster, clusters)

# Descriptive plots for the numeric variables: one boxplot per variable,
# split by k-means cluster and titled with the variable name.
# lapply() over the column names replaces apply() on an index matrix: nothing
# is drawn when there is no numeric column (apply() still calls the function
# once on a dummy value for zero-extent input), and invisible() keeps the
# boxplot statistics from being printed at the top level.
invisible(lapply(names(dados)[numericas], function(nome) {
  # ylab is set explicitly so the axis shows the variable name rather than
  # the subsetting expression.
  boxplot(dados[[nome]] ~ clustersK$cluster, main = nome, ylab = nome)
}))


# Descriptive plots for the categorical variables: one stacked barplot per
# variable with the level counts inside each k-means cluster.
# lapply() over the column names replaces apply() on an index matrix: nothing
# is drawn when there is no categorical column, and invisible() keeps the
# values returned by barplot() from being printed at the top level.
invisible(lapply(names(dados)[!numericas], function(nome) {
  # Rows are the levels of the variable, columns are the clusters.
  counts <- table(dados[[nome]], clustersK$cluster)
  barplot(
    counts,
    legend.text = rownames(counts),  # full argument name, no partial match
    main = nome,
    bty = "l",
    # Recycle the palette so a variable with more levels than colours does
    # not end up with NA (unfilled) bars.
    col = rep_len(cores, nrow(counts))
  )
}))