Census data

## load district shapfile
library(rgdal)
distr <- readOGR(dsn = "./data/DSL250-Shp/", layer = "Divsec")
## OGR data source with driver: ESRI Shapefile 
## Source: "./data/DSL250-Shp/", layer: "Divsec"
## with 533 features
## It has 9 fields
### solutions thanks to http://gis.stackexchange.com/questions/63577/joining-polygons-in-r
dist.coords <- coordinates(distr)

# Generate IDs for grouping
distr.id <- distr@data$DISTRICT #cut(distr.coords[,1], quantile(distr.coords[,1]), include.lowest=TRUE)

# Merge polygons by ID
library(maptools)
distr.union <- unionSpatialPolygons(distr, distr.id)

## Plotting - option 1
par(mfrow = c(1,2))
plot(distr)
plot(distr.union)

## Plotting - option 2
## plot(distr)
## plot(distr.union, add = TRUE, border = "red", lwd = 2)

# Convert SpatialPolygons to data frame
distr.df <- as(distr, "data.frame")

## small function to find the mode
simpleMode <- function(x) {
    return(x[which.max(tabulate(x))])
    }

# Aggregate and sum desired data attributes by ID list
distr.df.agg <- aggregate(distr.df[, 5], list(distr.id), FUN = simpleMode)
row.names(distr.df.agg) <- as.character(distr.df.agg$Group.1)
head(distr.df.agg)
##                   Group.1  x
## AMPARA             AMPARA EP
## ANURADHAPURA ANURADHAPURA NC
## BADULLA           BADULLA UP
## BATTICALOA     BATTICALOA EP
## COLOMBO           COLOMBO WP
## GALLE               GALLE SP
# Reconvert data frame to SpatialPolygons
distr.shp.agg <- SpatialPolygonsDataFrame(distr.union, distr.df.agg)

# Plotting
#library(gridExtra)
#grid.arrange(spplot(distr, "DISTRICT", main = "Distr: original county area"), 
#             spplot(distr.shp.agg, "Group.1", main = "Distr: aggregated county area"), ncol = 1)

str(distr.shp.agg@data)
## 'data.frame':    25 obs. of  2 variables:
##  $ Group.1: Factor w/ 25 levels "AMPARA","ANURADHAPURA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ x      : Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
names(distr.shp.agg@data) <- c("DISTRICT","PROVINCE")

writeOGR(distr.shp.agg, dsn = "./results/", layer = "districts", driver = "ESRI Shapefile", overwrite=TRUE)
## population by sector and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_POP_SER_265&conn_path=I2
pop <- read.table(file = "./data/Pop.csv",
    header = TRUE,
    sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop)
## 'data.frame':    288 obs. of  7 variables:
##  $ Sri.lanka.standard.classification.of.area: chr  "Sri Lanka" "Sri Lanka" "Sri Lanka" "Sri Lanka" ...
##  $ Sector                                   : chr  "All sectors" "All sectors" "All sectors" "Urban" ...
##  $ Sex                                      : chr  "Both Sexes" "Male" "Female" "Both Sexes" ...
##  $ X1971                                    : chr  "12,689,897" "6,531,361" "6,158,536" "-" ...
##  $ X1981                                    : chr  "14,846,750" "7,568,253" "7,278,497" "3,192,489" ...
##  $ X2001                                    : chr  "16,929,689" "8,425,607" "8,504,082" "2,467,301" ...
##  $ X2012                                    : chr  "20,359,439" "9,856,634" "10,502,805" "3,704,470" ...
## select entire population of districts and remove "Sri Lanka"
pop <- pop[!pop$Sri.lanka.standard.classification.of.area=="Sri Lanka" & pop$Sex=="Both Sexes" & pop$Sector=="All sectors",]
names(pop)[1] <- "DISTRICT"
pop <- subset(x = pop, select = c("DISTRICT", "X1971", "X1981", "X2001", "X2012"))
str(pop)
## 'data.frame':    25 obs. of  5 variables:
##  $ DISTRICT: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ X1971   : chr  "2,672,265" "-" "729,514" "1,187,925" ...
##  $ X1981   : chr  "1,699,241" "1,390,862" "829,704" "1,048,317" ...
##  $ X2001   : chr  "2,251,274" "2,063,684" "1,066,239" "1,279,028" ...
##  $ X2012   : chr  "2,324,349" "2,304,833" "1,221,948" "1,375,382" ...
head(pop)
##        DISTRICT     X1971     X1981     X2001     X2012
## 13      Colombo 2,672,265 1,699,241 2,251,274 2,324,349
## 25      Gampaha         - 1,390,862 2,063,684 2,304,833
## 37     Kalutara   729,514   829,704 1,066,239 1,221,948
## 49        Kandy 1,187,925 1,048,317 1,279,028 1,375,382
## 61       Matale   314,841   357,354   441,328   484,531
## 73 Nuwara Eliya   450,278   603,577   703,610   711,644
## fix the numbers
for (i in 2:5) {
    pop[,i] <- as.numeric(gsub(pattern = ",",replacement = "", x=pop[,i],fixed = TRUE))
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## sort the census data
pop <- pop[order(pop$DISTRICT),]
##convert text so that it matches the shapefile
pop$DISTRICT <- toupper(pop$DISTRICT)

distr.shp.agg$DISTRICT %in% pop$DISTRICT
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [23]  TRUE  TRUE  TRUE
pop$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE]
## [1] "MONERAGALA"
distr.shp.agg$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE]
## [1] MONARAGALA
## 25 Levels: AMPARA ANURADHAPURA BADULLA BATTICALOA COLOMBO ... VAVUNIYA
pop$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE] <- as.character(distr.shp.agg$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE])


library(dplyr)
distr.shp.agg@data <- left_join(distr.shp.agg@data, pop)
## Joining by: "DISTRICT"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## plot choropleht map
## --------------------------------------------------
## https://cran.r-project.org/web/packages/tmap/vignettes/tmap-nutshell.html
## https://cran.r-project.org/web/packages/tmap/tmap.pdf
## install.packages("tmap", dependencies = TRUE)
library(tmap)

## quick plot
qtm(distr.shp.agg, fill="X2001")

## detailed plot
tm_shape(distr.shp.agg) +
    tm_fill("X2012", textNA="no data", title="Population 2012") +
    tm_text("DISTRICT", size= .5) +
    tm_borders() +
tm_layout(title = "Population Sri Lanka")

## Plot population development as Graph
## --------------------------------------------------
library(ggplot2)
a <- distr.shp.agg@data

library(reshape)
b <- melt(a, id.vars = c("PROVINCE", "DISTRICT"), measure.vars = c("X1971","X1981","X2001","X2012"))
str(b)
## 'data.frame':    100 obs. of  4 variables:
##  $ PROVINCE: Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
##  $ DISTRICT: chr  "AMPARA" "ANURADHAPURA" "BADULLA" "BATTICALOA" ...
##  $ variable: Factor w/ 4 levels "X1971","X1981",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  272605 388770 615405 256721 2672265 ...
b$value <- b$value/1000
levels(b$variable) <- c("1971","1981","2001","2012")

ggplot(data = b, aes(x = variable, y = value, group = DISTRICT, col = DISTRICT)) +
    geom_line() +
    geom_point() +
    facet_grid(. ~ PROVINCE) +
    labs(title = "Population development Sri Lanka",
         x = "Census Year",
         y = "Population in thousands") +
    guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
    theme(legend.position = "bottom",
          text = element_text(size=10),
          legend.text = element_text(size=8))
## Warning: Removed 4 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).

Exercises

  1. Create maps for the population of the different provinces of Sri Lanka.

Tasks to perform: - data aggregation - Polygon union - Plotting

  1. Which province has the highest population standard deviation? Calculate and map your results.

  2. Is there a significant difference between the sexes in the different districts? Calculate and map your results.

pop <- read.table(file = "./data/Pop.csv",
    header = TRUE,
    sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop)
## 'data.frame':    288 obs. of  7 variables:
##  $ Sri.lanka.standard.classification.of.area: chr  "Sri Lanka" "Sri Lanka" "Sri Lanka" "Sri Lanka" ...
##  $ Sector                                   : chr  "All sectors" "All sectors" "All sectors" "Urban" ...
##  $ Sex                                      : chr  "Both Sexes" "Male" "Female" "Both Sexes" ...
##  $ X1971                                    : chr  "12,689,897" "6,531,361" "6,158,536" "-" ...
##  $ X1981                                    : chr  "14,846,750" "7,568,253" "7,278,497" "3,192,489" ...
##  $ X2001                                    : chr  "16,929,689" "8,425,607" "8,504,082" "2,467,301" ...
##  $ X2012                                    : chr  "20,359,439" "9,856,634" "10,502,805" "3,704,470" ...
pop <- pop[!pop$Sri.lanka.standard.classification.of.area=="Sri Lanka" & !pop$Sex=="Both Sexes" & pop$Sector=="All sectors",]
names(pop)[1] <- "DISTRICT"
pop <- subset(x = pop, select = c("DISTRICT", "Sex","X1971", "X1981", "X2001", "X2012"))
str(pop)
## 'data.frame':    50 obs. of  6 variables:
##  $ DISTRICT: chr  "Colombo" "Colombo" "Gampaha" "Gampaha" ...
##  $ Sex     : chr  "Male" "Female" "Male" "Female" ...
##  $ X1971   : chr  "1,400,514" "1,271,751" "-" "-" ...
##  $ X1981   : chr  "894,509" "804,732" "703,008" "687,854" ...
##  $ X2001   : chr  "1,151,413" "1,099,861" "1,007,702" "1,055,982" ...
##  $ X2012   : chr  "1,140,472" "1,183,877" "1,116,893" "1,187,940" ...
## fix the numbers
for (i in 3:6) {
    pop[,i] <- as.numeric(gsub(pattern = ",",replacement = "", x=pop[,i],fixed = TRUE))
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
head(pop)
##    DISTRICT    Sex   X1971  X1981   X2001   X2012
## 14  Colombo   Male 1400514 894509 1151413 1140472
## 15  Colombo Female 1271751 804732 1099861 1183877
## 26  Gampaha   Male      NA 703008 1007702 1116893
## 27  Gampaha Female      NA 687854 1055982 1187940
## 38 Kalutara   Male  367457 413262  527281  591284
## 39 Kalutara Female  362057 416442  538958  630664
##  Poverty head count ratio (%) by Province and District (Poverty Head Count Ratio)
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LSS_POV_101&conn_path=I2
poverty <- read.table(file = "./data/poverty.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(poverty)
## 'data.frame':    25 obs. of  7 variables:
##  $ Province.and.district: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ X1991                : chr  "16.2" "14.7" "32.3" "35.9" ...
##  $ X1996                : chr  "12.0" "14.1" "29.5" "36.7" ...
##  $ X2002                : chr  "6.4" "10.7" "20.0" "24.9" ...
##  $ X2007                : chr  "5.4" "8.7" "13.0" "17.0" ...
##  $ X2010                : chr  "3.6" "3.9" "6.0" "10.3" ...
##  $ X2013                : num  1.4 2.1 3.1 6.2 7.8 6.6 9.9 7.1 4.9 8.3 ...
str(pop)
## 'data.frame':    50 obs. of  6 variables:
##  $ DISTRICT: chr  "Colombo" "Colombo" "Gampaha" "Gampaha" ...
##  $ Sex     : chr  "Male" "Female" "Male" "Female" ...
##  $ X1971   : num  1400514 1271751 NA NA 367457 ...
##  $ X1981   : num  894509 804732 703008 687854 413262 ...
##  $ X2001   : num  1151413 1099861 1007702 1055982 527281 ...
##  $ X2012   : num  1140472 1183877 1116893 1187940 591284 ...
## sex ratio
pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"]
##  [1] 0.9633366 0.9401931 0.9375579 0.9113385 0.9313719 0.9166435 0.9213454
##  [8] 0.9192682 0.9658187 0.8852600 1.0108246 0.9692792 0.9964071 0.9663243
## [15] 0.9086052 0.9382241 0.9760659 0.9238491 0.9378433 0.9537431 0.9779453
## [22] 0.9255910 0.9880030 0.9724350 0.9113108
## is poverty and sex ratio related?
plot(pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"], poverty$X2013)

cor(pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"], poverty$X2013)
## [1] 0.397554
## is the distribution of sexes significantly different?



## does do ratio between the sexes change?
str(pop)
## 'data.frame':    50 obs. of  6 variables:
##  $ DISTRICT: chr  "Colombo" "Colombo" "Gampaha" "Gampaha" ...
##  $ Sex     : chr  "Male" "Female" "Male" "Female" ...
##  $ X1971   : num  1400514 1271751 NA NA 367457 ...
##  $ X1981   : num  894509 804732 703008 687854 413262 ...
##  $ X2001   : num  1151413 1099861 1007702 1055982 527281 ...
##  $ X2012   : num  1140472 1183877 1116893 1187940 591284 ...
library(plyr)
a <- pop[pop$DISTRICT=="Colombo",]
a$X1971[2]
## [1] 1271751
sr <- ddply(pop, "DISTRICT", function(x) {
                sr1 <- x$X1971[1]/x$X1971[2]
                sr2 <- x$X1981[1]/x$X1981[2]
                sr3 <- x$X2001[1]/x$X2001[2]
                sr4 <- x$X2012[1]/x$X2012[2]
                data.frame(sr.1971 = sr1,
                           sr.1981 = sr2,
                           sr.2001 = sr3,
                           sr.2012 = sr4)
            })

str(sr)
## 'data.frame':    25 obs. of  5 variables:
##  $ DISTRICT: chr  "Ampara" "Anuradhapura" "Badulla" "Batticaloa" ...
##  $ sr.1971 : num  1.1 1.15 1.05 1.07 1.1 ...
##  $ sr.1981 : num  1.1 1.15 1.03 1.03 1.11 ...
##  $ sr.2001 : num  1.017 1.042 0.988 NA 1.047 ...
##  $ sr.2012 : num  0.938 0.954 0.926 0.909 0.963 ...
library(reshape)
sr2 <- melt(sr, id.vars = c("DISTRICT"), measure.vars = c("sr.1971","sr.1981","sr.2001","sr.2012"))
str(sr2)
## 'data.frame':    100 obs. of  3 variables:
##  $ DISTRICT: chr  "Ampara" "Anuradhapura" "Badulla" "Batticaloa" ...
##  $ variable: Factor w/ 4 levels "sr.1971","sr.1981",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  1.1 1.15 1.05 1.07 1.1 ...
ggplot(sr2, aes(x=variable, y=value, group=DISTRICT, col=DISTRICT)) +
    geom_line() +
        geom_point() +
            labs(title = "Sex-ratio development in Sri Lanka",
         x = "Census Year",
         y = "Sex ratio (Male/Female)") +
    guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
    theme(legend.position = "bottom",
          text = element_text(size=10),
          legend.text = element_text(size=8))
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 11 rows containing missing values (geom_point).

## Join with district data
distr <- readOGR(dsn = "./results/", layer="districts")
## OGR data source with driver: ESRI Shapefile 
## Source: "./results/", layer: "districts"
## with 25 features
## It has 2 fields
## sort the census data
sr <- sr[order(sr$DISTRICT),]
##convert text so that it matches the shapefile
sr$DISTRICT <- toupper(sr$DISTRICT)
sr
##        DISTRICT   sr.1971   sr.1981   sr.2001   sr.2012
## 1        AMPARA 1.0995618 1.0987628 1.0170446 0.9382241
## 2  ANURADHAPURA 1.1530393 1.1484236 1.0420043 0.9537431
## 3       BADULLA 1.0450512 1.0324519 0.9877243 0.9255910
## 4    BATTICALOA 1.0726038 1.0301449        NA 0.9086052
## 5       COLOMBO 1.1012486 1.1115614 1.0468714 0.9633366
## 6         GALLE 0.9771591 0.9497629 0.9473319 0.9213454
## 7       GAMPAHA        NA 1.0220308 0.9542795 0.9401931
## 8    HAMBANTOTA 1.0578676 1.0451595 0.9999240 0.9658187
## 9        JAFFNA 0.9951004 0.9876417        NA 0.8852600
## 10     KALUTARA 1.0149148 0.9923639 0.9783341 0.9375579
## 11        KANDY 1.0389117 0.9818493 0.9525297 0.9113385
## 12      KEGALLE 1.0384114 0.9865484 0.9593476 0.9113108
## 13  KILINOCHCHI        NA        NA        NA 0.9663243
## 14   KURUNEGALA 1.0531755 1.0231076 0.9811990 0.9238491
## 15       MANNAR 1.1638013 1.1460316        NA 1.0108246
## 16       MATALE 1.0634487 1.0356600 0.9946307 0.9313719
## 17       MATARA 0.9752672 0.9415824 0.9416612 0.9192682
## 18   MONERAGALA 1.1512639 1.2113100 1.0424396 0.9880030
## 19   MULLAITIVU        NA 1.2327673        NA 0.9964071
## 20 NUWARA ELIYA 1.0405224 1.0153831 0.9899261 0.9166435
## 21  POLONNARUWA 1.2485262 1.3105864 1.0969304 0.9779453
## 22     PUTTALAM 1.0602903 1.0360680 0.9866554 0.9378433
## 23    RATNAPURA 1.0885251 1.0763151 1.0198825 0.9724350
## 24  TRINCOMALEE 1.1906530 1.1589513        NA 0.9760659
## 25     VAVUNIYA 1.2420669 1.1385863        NA 0.9692792
distr$DISTRICT %in% sr$DISTRICT
##  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [23]  TRUE  TRUE  TRUE
sr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE]
## [1] "MONERAGALA"
distr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE]
## [1] MONARAGALA
## 25 Levels: AMPARA ANURADHAPURA BADULLA BATTICALOA COLOMBO ... VAVUNIYA
sr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE] <- as.character(distr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE])

library(dplyr)
distr@data <- left_join(distr@data, sr)
## Joining by: "DISTRICT"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
str(distr@data)
## 'data.frame':    25 obs. of  6 variables:
##  $ DISTRICT: chr  "AMPARA" "ANURADHAPURA" "BADULLA" "BATTICALOA" ...
##  $ PROVINCE: Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
##  $ sr.1971 : num  1.1 1.15 1.05 1.07 1.1 ...
##  $ sr.1981 : num  1.1 1.15 1.03 1.03 1.11 ...
##  $ sr.2001 : num  1.017 1.042 0.988 NA 1.047 ...
##  $ sr.2012 : num  0.938 0.954 0.926 0.909 0.963 ...
spplot(obj = distr, zcol = c("sr.1971","sr.1981","sr.2001","sr.2012"))

## relation between sex ratio and mean elevation?
## H: the higher the elevation, the more tee production, the more women working
library(raster)
srtm <- raster("./results/srtm.tif")
distr@data$elev <- extract(x = srtm, y = distr, fun=mean, na.rm = TRUE)
## Warning in .local(x, y, ...): Transforming SpatialPolygons to the CRS of
## the Raster
cor(distr@data$elev,distr@data$sr.2012)
##            [,1]
## [1,] -0.3166771
## Population by age group, sex, residential sector and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_POP_GEO_174&conn_path=I2
## -> Pop 2001
pop2 <- read.table(file = "./data/Pop2.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop2)
## 'data.frame':    67 obs. of  5 variables:
##  $ Residential.sector: chr  "Residential sector" "Urban" "Urban" "Urban" ...
##  $ District          : chr  "District" "Colombo" "Gampaha" "Kalutara" ...
##  $ Sex               : chr  "Sex" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ X2001             : chr  "All ages" "1,229,572" "300,933" "113,188" ...
##  $ X2012             : chr  "All ages" "1,802,904" "360,221" "109,069" ...
pop2 <- pop2[2:67,]
pop2$X2001 <- as.numeric(gsub(",","",pop2$X2001))
## Warning: NAs introduced by coercion
pop2$X2012 <- as.numeric(gsub(",","",pop2$X2012))
## Warning: NAs introduced by coercion
head(pop2)
##   Residential.sector     District        Sex   X2001   X2012
## 2              Urban      Colombo Both Sexes 1229572 1802904
## 3              Urban      Gampaha Both Sexes  300933  360221
## 4              Urban     Kalutara Both Sexes  113188  109069
## 5              Urban        Kandy Both Sexes  155987  170544
## 6              Urban       Matale Both Sexes   36103   60276
## 7              Urban Nuwara Eliya Both Sexes   43073   40151
str(pop2)
## 'data.frame':    66 obs. of  5 variables:
##  $ Residential.sector: chr  "Urban" "Urban" "Urban" "Urban" ...
##  $ District          : chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ Sex               : chr  "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
##  $ X2001             : num  1229572 300933 113188 155987 36103 ...
##  $ X2012             : num  1802904 360221 109069 170544 60276 ...
library(ggplot2)

ggplot(pop2, aes(x=District,y=X2001, fill=Residential.sector)) +
    geom_bar(stat = "identity") +
        coord_flip()
## Warning: Removed 15 rows containing missing values (position_stack).

## Employed population by sex, province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_EMP_101&conn_path=I2
employed <- read.table(file = "./data/Employed_population.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(employed)
## 'data.frame':    144 obs. of  4 variables:
##  $ Period               : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ Province.and.district: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ Male                 : chr  "607,310" "619,385" "295,405" "319,442" ...
##  $ Female               : chr  "285,796" "305,149" "150,323" "161,706" ...
employed$Male <- as.numeric(gsub(",","",employed$Male))
employed$Female <- as.numeric(gsub(",","",employed$Female))
names(employed)[2] <- "District"
head(employed)
##   Period     District   Male Female
## 1   2006      Colombo 607310 285796
## 2   2006      Gampaha 619385 305149
## 3   2006     Kalutara 295405 150323
## 4   2006        Kandy 319442 161706
## 5   2006       Matale 104577  48459
## 6   2006 Nuwara Eliya 160899 134758
library(reshape)
employed2 <- melt(employed, id.vars = c("District","Period"),measure.vars = c("Male","Female"))
str(employed2)
## 'data.frame':    288 obs. of  4 variables:
##  $ District: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ Period  : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ variable: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  607310 619385 295405 319442 104577 ...
names(employed2)[3:4] <- c("Sex","Employed")

library(ggplot2)

ggplot(employed2, aes(x=District,y=Employed, fill = Sex)) +
    geom_bar(stat="identity") +
        coord_flip()

## Unemployment rate (%) by province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_UNP_101&conn_path=I2
unemployed <- read.table(file = "./data/Unemployment_rate.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(unemployed)
## 'data.frame':    25 obs. of  8 variables:
##  $ Province.and.district: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ X2006                : chr  "5.6" "5.8" "5.4" "8.3" ...
##  $ X2007                : chr  "5.3" "4.7" "8.0" "7.6" ...
##  $ X2008                : chr  "4.1" "3.8" "5.9" "6.4" ...
##  $ X2009                : chr  "4.4" "4.6" "4.1" "9.7" ...
##  $ X2010                : chr  "3.3" "4.1" "3.7" "9.6" ...
##  $ X2011                : num  2.9 4 3.4 8.1 5 1.9 3.8 6 6.5 3.1 ...
##  $ X2012                : num  2.9 3.7 4 7.2 2.8 1.8 2.3 7 5.3 5.1 ...
tmp <- apply(X = unemployed[2:8], MARGIN = 2, FUN = function(x) {as.numeric(gsub(",","",x))})
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
head(tmp)
##      X2006 X2007 X2008 X2009 X2010 X2011 X2012
## [1,]   5.6   5.3   4.1   4.4   3.3   2.9   2.9
## [2,]   5.8   4.7   3.8   4.6   4.1   4.0   3.7
## [3,]   5.4   8.0   5.9   4.1   3.7   3.4   4.0
## [4,]   8.3   7.6   6.4   9.7   9.6   8.1   7.2
## [5,]   4.6   3.6   6.1   5.4   5.2   5.0   2.8
## [6,]   5.1   4.1   5.2   2.4   3.0   1.9   1.8
unemployed <- data.frame(DISTRICT = unemployed$Province.and.district, tmp)
str(unemployed)
## 'data.frame':    25 obs. of  8 variables:
##  $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 7 10 11 16 20 6 17 8 9 ...
##  $ X2006   : num  5.6 5.8 5.4 8.3 4.6 5.1 9.4 8.2 9.4 NA ...
##  $ X2007   : num  5.3 4.7 8 7.6 3.6 4.1 7.8 9.2 8.7 NA ...
##  $ X2008   : num  4.1 3.8 5.9 6.4 6.1 5.2 6.5 8.9 9.6 NA ...
##  $ X2009   : num  4.4 4.6 4.1 9.7 5.4 2.4 8.3 9.8 10.6 NA ...
##  $ X2010   : num  3.3 4.1 3.7 9.6 5.2 3 6.4 8.8 8.9 NA ...
##  $ X2011   : num  2.9 4 3.4 8.1 5 1.9 3.8 6 6.5 3.1 ...
##  $ X2012   : num  2.9 3.7 4 7.2 2.8 1.8 2.3 7 5.3 5.1 ...
library(reshape)
unemployed2 <- melt(unemployed, id.vars = "DISTRICT", measure.vars = names(unemployed[,2:8]))
str(unemployed2)
## 'data.frame':    175 obs. of  3 variables:
##  $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 7 10 11 16 20 6 17 8 9 ...
##  $ variable: Factor w/ 7 levels "X2006","X2007",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  5.6 5.8 5.4 8.3 4.6 5.1 9.4 8.2 9.4 NA ...
names(unemployed2)[2:3] <- c("Year","Unemployed")
head(unemployed2)
##       DISTRICT  Year Unemployed
## 1      Colombo X2006        5.6
## 2      Gampaha X2006        5.8
## 3     Kalutara X2006        5.4
## 4        Kandy X2006        8.3
## 5       Matale X2006        4.6
## 6 Nuwara Eliya X2006        5.1
library(ggplot2)

ggplot(unemployed2, aes(x = Year, y = Unemployed, group = DISTRICT, col=DISTRICT)) +
    geom_line() +
        guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
            theme(legend.position = "bottom",
          text = element_text(size=10),
          legend.text = element_text(size=8))
## Warning: Removed 31 rows containing missing values (geom_path).

## Is there a significant difference in the employment of Male and Female?

## propotion of males employed
str(employed)
## 'data.frame':    144 obs. of  4 variables:
##  $ Period  : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ District: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ Male    : num  607310 619385 295405 319442 104577 ...
##  $ Female  : num  285796 305149 150323 161706 48459 ...
m.e.2012 <- employed2[employed2$Sex=="Male" & employed2$Period==2012,]
m.e.2012
##         District Period  Sex Employed
## 120      Colombo   2012 Male   604050
## 121      Gampaha   2012 Male   628967
## 122     Kalutara   2012 Male   377978
## 123        Kandy   2012 Male   298396
## 124       Matale   2012 Male   124522
## 125 Nuwara Eliya   2012 Male   165075
## 126        Galle   2012 Male   246552
## 127       Matara   2012 Male   223975
## 128   Hambantota   2012 Male   189204
## 129       Jaffna   2012 Male   140919
## 130       Mannar   2012 Male    26467
## 131     Vavuniya   2012 Male    38331
## 132   Mullaitivu   2012 Male    18527
## 133  Kilinochchi   2012 Male    20821
## 134   Batticaloa   2012 Male   111534
## 135       Ampara   2012 Male   153537
## 136  Trincomalee   2012 Male    98978
## 137   Kurunegala   2012 Male   466534
## 138     Puttalam   2012 Male   229636
## 139 Anuradhapura   2012 Male   216072
## 140  Polonnaruwa   2012 Male   119725
## 141      Badulla   2012 Male   247652
## 142   Moneragala   2012 Male   154025
## 143    Ratnapura   2012 Male   347000
## 144      Kegalle   2012 Male   228611
str(pop)
## 'data.frame':    50 obs. of  6 variables:
##  $ DISTRICT: chr  "Colombo" "Colombo" "Gampaha" "Gampaha" ...
##  $ Sex     : chr  "Male" "Female" "Male" "Female" ...
##  $ X1971   : num  1400514 1271751 NA NA 367457 ...
##  $ X1981   : num  894509 804732 703008 687854 413262 ...
##  $ X2001   : num  1151413 1099861 1007702 1055982 527281 ...
##  $ X2012   : num  1140472 1183877 1116893 1187940 591284 ...
library(reshape)
pop.r <- melt(pop, id.vars = c("DISTRICT", "Sex"), measure.vars = names(pop[3:6]))
str(pop.r)
## 'data.frame':    200 obs. of  4 variables:
##  $ DISTRICT: chr  "Colombo" "Colombo" "Gampaha" "Gampaha" ...
##  $ Sex     : chr  "Male" "Female" "Male" "Female" ...
##  $ variable: Factor w/ 4 levels "X1971","X1981",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  1400514 1271751 NA NA 367457 ...
names(pop.r) <- c("DISTRICT","Sex","Year","Population")
m.2012 <- pop.r[pop.r$Sex=="Male" & pop.r$Year=="X2012",]
m.2012
##         DISTRICT  Sex  Year Population
## 151      Colombo Male X2012    1140472
## 153      Gampaha Male X2012    1116893
## 155     Kalutara Male X2012     591284
## 157        Kandy Male X2012     655791
## 159       Matale Male X2012     233657
## 161 Nuwara Eliya Male X2012     340347
## 163        Galle Male X2012     509902
## 165       Matara Male X2012     389903
## 167   Hambantota Male X2012     294736
## 169       Jaffna Male X2012     274173
## 171       Mannar Male X2012      50053
## 173     Vavuniya Male X2012      84715
## 175   Mullaitivu Male X2012      46036
## 177  Kilinochchi Male X2012      55783
## 179   Batticaloa Male X2012     250676
## 181       Ampara Male X2012     314352
## 183  Trincomalee Male X2012     187472
## 185   Kurunegala Male X2012     777201
## 187     Puttalam Male X2012     368971
## 189 Anuradhapura Male X2012     420100
## 191  Polonnaruwa Male X2012     200780
## 193      Badulla Male X2012     391948
## 195   Moneragala Male X2012     224168
## 197    Ratnapura Male X2012     536401
## 199      Kegalle Male X2012     400820
p.m.e.2012 <- data.frame(DISTRICT = m.e.2012[,1], pme = m.e.2012$Employed/m.2012$Population)
p.m.e.2012
##        DISTRICT       pme
## 1       Colombo 0.5296491
## 2       Gampaha 0.5631399
## 3      Kalutara 0.6392495
## 4         Kandy 0.4550169
## 5        Matale 0.5329265
## 6  Nuwara Eliya 0.4850197
## 7         Galle 0.4835282
## 8        Matara 0.5744377
## 9    Hambantota 0.6419440
## 10       Jaffna 0.5139784
## 11       Mannar 0.5287795
## 12     Vavuniya 0.4524700
## 13   Mullaitivu 0.4024459
## 14  Kilinochchi 0.3732499
## 15   Batticaloa 0.4449329
## 16       Ampara 0.4884238
## 17  Trincomalee 0.5279615
## 18   Kurunegala 0.6002746
## 19     Puttalam 0.6223687
## 20 Anuradhapura 0.5143347
## 21  Polonnaruwa 0.5962994
## 22      Badulla 0.6318491
## 23   Moneragala 0.6870963
## 24    Ratnapura 0.6469041
## 25      Kegalle 0.5703583
## proportion of females employed
str(employed)
## 'data.frame':    144 obs. of  4 variables:
##  $ Period  : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ District: chr  "Colombo" "Gampaha" "Kalutara" "Kandy" ...
##  $ Male    : num  607310 619385 295405 319442 104577 ...
##  $ Female  : num  285796 305149 150323 161706 48459 ...
f.e.2012 <- employed2[employed2$Sex=="Female" & employed2$Period==2012,]
f.e.2012
##         District Period    Sex Employed
## 264      Colombo   2012 Female   298470
## 265      Gampaha   2012 Female   293426
## 266     Kalutara   2012 Female   166909
## 267        Kandy   2012 Female   156931
## 268       Matale   2012 Female    65560
## 269 Nuwara Eliya   2012 Female   119923
## 270        Galle   2012 Female   122244
## 271       Matara   2012 Female   108574
## 272   Hambantota   2012 Female    84111
## 273       Jaffna   2012 Female    51760
## 274       Mannar   2012 Female     4533
## 275     Vavuniya   2012 Female    16150
## 276   Mullaitivu   2012 Female     5055
## 277  Kilinochchi   2012 Female     4228
## 278   Batticaloa   2012 Female    33179
## 279       Ampara   2012 Female    40636
## 280  Trincomalee   2012 Female    29419
## 281   Kurunegala   2012 Female   234939
## 282     Puttalam   2012 Female    88092
## 283 Anuradhapura   2012 Female   139293
## 284  Polonnaruwa   2012 Female    49146
## 285      Badulla   2012 Female   160484
## 286   Moneragala   2012 Female    78938
## 287    Ratnapura   2012 Female   176466
## 288      Kegalle   2012 Female   123151
f.2012 <- pop.r[pop.r$Sex=="Female" & pop.r$Year=="X2012",]
f.2012
##         DISTRICT    Sex  Year Population
## 152      Colombo Female X2012    1183877
## 154      Gampaha Female X2012    1187940
## 156     Kalutara Female X2012     630664
## 158        Kandy Female X2012     719591
## 160       Matale Female X2012     250874
## 162 Nuwara Eliya Female X2012     371297
## 164        Galle Female X2012     553432
## 166       Matara Female X2012     424145
## 168   Hambantota Female X2012     305167
## 170       Jaffna Female X2012     309709
## 172       Mannar Female X2012      49517
## 174     Vavuniya Female X2012      87400
## 176   Mullaitivu Female X2012      46202
## 178  Kilinochchi Female X2012      57727
## 180   Batticaloa Female X2012     275891
## 182       Ampara Female X2012     335050
## 184  Trincomalee Female X2012     192069
## 186   Kurunegala Female X2012     841264
## 188     Puttalam Female X2012     393425
## 190 Anuradhapura Female X2012     440475
## 192  Polonnaruwa Female X2012     205308
## 194      Badulla Female X2012     423457
## 196   Moneragala Female X2012     226890
## 198    Ratnapura Female X2012     551606
## 200      Kegalle Female X2012     439828
p.f.e.2012 <- data.frame(DISTRICT = f.e.2012[,1], pfe = f.e.2012$Employed/f.2012$Population)
p.f.e.2012 
##        DISTRICT        pfe
## 1       Colombo 0.25211234
## 2       Gampaha 0.24700406
## 3      Kalutara 0.26465598
## 4         Kandy 0.21808361
## 5        Matale 0.26132640
## 6  Nuwara Eliya 0.32298403
## 7         Galle 0.22088351
## 8        Matara 0.25598321
## 9    Hambantota 0.27562286
## 10       Jaffna 0.16712462
## 11       Mannar 0.09154432
## 12     Vavuniya 0.18478261
## 13   Mullaitivu 0.10941085
## 14  Kilinochchi 0.07324129
## 15   Batticaloa 0.12026126
## 16       Ampara 0.12128339
## 17  Trincomalee 0.15316891
## 18   Kurunegala 0.27926905
## 19     Puttalam 0.22391053
## 20 Anuradhapura 0.31623361
## 21  Polonnaruwa 0.23937694
## 22      Badulla 0.37898535
## 23   Moneragala 0.34791309
## 24    Ratnapura 0.31991313
## 25      Kegalle 0.27999809
t.test(p.m.e.2012[,2],p.f.e.2012[,2]) #??
## 
##  Welch Two Sample t-test
## 
## data:  p.m.e.2012[, 2] and p.f.e.2012[, 2]
## t = 13.383, df = 47.983, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2645002 0.3580250
## sample estimates:
## mean of x mean of y 
## 0.5402655 0.2290029
## Employed population by level of education, sex, province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_EMP_107&conn_path=I2
une.edu <- read.table(file = "./data/Employed_population_edu.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(une.edu)
## 'data.frame':    576 obs. of  5 variables:
##  $ Period               : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ Province.and.district: chr  "Colombo" "Colombo" "Colombo" "Colombo" ...
##  $ Level.of.education   : chr  "Below 5" "6 - 10" "GCE O/L" "GCE A/L" ...
##  $ Male                 : chr  "67,524" "259,471" "122,423" "157,892" ...
##  $ Female               : chr  "30,200" "94,441" "61,268" "99,888" ...
tmp <- apply(X = une.edu[,4:5], MARGIN = 2, FUN = function(x) {as.numeric(gsub(",","",x))})
head(tmp)
##        Male Female
## [1,]  67524  30200
## [2,] 259471  94441
## [3,] 122423  61268
## [4,] 157892  99888
## [5,]  57123  31128
## [6,] 301787 122353
une.edu <- data.frame(YEAR = une.edu$Period,
                      DISTRICT = une.edu$Province.and.district,
                      LoE = une.edu$Level.of.education,
                      tmp)
str(une.edu)
## 'data.frame':    576 obs. of  5 variables:
##  $ YEAR    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 5 5 5 7 7 7 7 10 10 ...
##  $ LoE     : Factor w/ 4 levels "6 - 10","Below 5",..: 2 1 4 3 2 1 4 3 2 1 ...
##  $ Male    : num  67524 259471 122423 157892 57123 ...
##  $ Female  : num  30200 94441 61268 99888 31128 ...
library(reshape)
une.edu2 <- melt(une.edu, id.vars = c("YEAR","DISTRICT","LoE"), measure.vars = c("Male","Female"))
str(une.edu2)
## 'data.frame':    1152 obs. of  5 variables:
##  $ YEAR    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 5 5 5 7 7 7 7 10 10 ...
##  $ LoE     : Factor w/ 4 levels "6 - 10","Below 5",..: 2 1 4 3 2 1 4 3 2 1 ...
##  $ variable: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
##  $ value   : num  67524 259471 122423 157892 57123 ...
names(une.edu2)[4:5] <- c("Sex","num_employed")
une.edu2[1,]
##   YEAR DISTRICT     LoE  Sex num_employed
## 1 2006  Colombo Below 5 Male        67524
ggplot(une.edu2, aes(x=Sex, y=num_employed, fill=Sex)) +
    geom_bar(stat="identity") +
        facet_grid(LoE ~ DISTRICT)

## is employment rate linked to level of education?
a <- cbind(une.edu2[une.edu2$DISTRICT==levels(une.edu2$DISTRICT)[20] & une.edu2$Sex=="Male" & une.edu2$YEAR==2012,5],
           une.edu2[une.edu2$DISTRICT==levels(une.edu2$DISTRICT)[20] & une.edu2$Sex=="Female" & une.edu2$YEAR==2012,5]
           )
a
##       [,1]  [,2]
## [1,] 48006 49088
## [2,] 95228 55045
## [3,] 13180  7523
## [4,]  8661  8267
dimnames(a) <- list("LoE"=levels(une.edu2$LoE),
                    "Sex"=levels(une.edu2$Sex)
                    )
a
##          Sex
## LoE        Male Female
##   6 - 10  48006  49088
##   Below 5 95228  55045
##   GCE A/L 13180   7523
##   GCE O/L  8661   8267
#barplot(a, beside = TRUE)
prop.table(a)
##          Sex
## LoE             Male     Female
##   6 - 10  0.16844329 0.17223981
##   Below 5 0.33413568 0.19314171
##   GCE A/L 0.04624594 0.02639668
##   GCE O/L 0.03038969 0.02900722
chisq.test(a) #??
## 
##  Pearson's Chi-squared test
## 
## data:  a
## X-squared = 5291.3, df = 3, p-value < 2.2e-16