## load district shapfile
library(rgdal)
distr <- readOGR(dsn = "./data/DSL250-Shp/", layer = "Divsec")
## OGR data source with driver: ESRI Shapefile
## Source: "./data/DSL250-Shp/", layer: "Divsec"
## with 533 features
## It has 9 fields
### solutions thanks to http://gis.stackexchange.com/questions/63577/joining-polygons-in-r
dist.coords <- coordinates(distr)
# Generate IDs for grouping
distr.id <- distr@data$DISTRICT #cut(distr.coords[,1], quantile(distr.coords[,1]), include.lowest=TRUE)
# Merge polygons by ID
library(maptools)
distr.union <- unionSpatialPolygons(distr, distr.id)
## Plotting - option 1
par(mfrow = c(1,2))
plot(distr)
plot(distr.union)
## Plotting - option 2
## plot(distr)
## plot(distr.union, add = TRUE, border = "red", lwd = 2)
# Convert SpatialPolygons to data frame
distr.df <- as(distr, "data.frame")
## small function to find the mode
simpleMode <- function(x) {
return(x[which.max(tabulate(x))])
}
# Aggregate and sum desired data attributes by ID list
distr.df.agg <- aggregate(distr.df[, 5], list(distr.id), FUN = simpleMode)
row.names(distr.df.agg) <- as.character(distr.df.agg$Group.1)
head(distr.df.agg)
## Group.1 x
## AMPARA AMPARA EP
## ANURADHAPURA ANURADHAPURA NC
## BADULLA BADULLA UP
## BATTICALOA BATTICALOA EP
## COLOMBO COLOMBO WP
## GALLE GALLE SP
# Reconvert data frame to SpatialPolygons
distr.shp.agg <- SpatialPolygonsDataFrame(distr.union, distr.df.agg)
# Plotting
#library(gridExtra)
#grid.arrange(spplot(distr, "DISTRICT", main = "Distr: original county area"),
# spplot(distr.shp.agg, "Group.1", main = "Distr: aggregated county area"), ncol = 1)
str(distr.shp.agg@data)
## 'data.frame': 25 obs. of 2 variables:
## $ Group.1: Factor w/ 25 levels "AMPARA","ANURADHAPURA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ x : Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
names(distr.shp.agg@data) <- c("DISTRICT","PROVINCE")
writeOGR(distr.shp.agg, dsn = "./results/", layer = "districts", driver = "ESRI Shapefile", overwrite=TRUE)
## population by sector and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_POP_SER_265&conn_path=I2
pop <- read.table(file = "./data/Pop.csv",
header = TRUE,
sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop)
## 'data.frame': 288 obs. of 7 variables:
## $ Sri.lanka.standard.classification.of.area: chr "Sri Lanka" "Sri Lanka" "Sri Lanka" "Sri Lanka" ...
## $ Sector : chr "All sectors" "All sectors" "All sectors" "Urban" ...
## $ Sex : chr "Both Sexes" "Male" "Female" "Both Sexes" ...
## $ X1971 : chr "12,689,897" "6,531,361" "6,158,536" "-" ...
## $ X1981 : chr "14,846,750" "7,568,253" "7,278,497" "3,192,489" ...
## $ X2001 : chr "16,929,689" "8,425,607" "8,504,082" "2,467,301" ...
## $ X2012 : chr "20,359,439" "9,856,634" "10,502,805" "3,704,470" ...
## select entire population of districts and remove "Sri Lanka"
pop <- pop[!pop$Sri.lanka.standard.classification.of.area=="Sri Lanka" & pop$Sex=="Both Sexes" & pop$Sector=="All sectors",]
names(pop)[1] <- "DISTRICT"
pop <- subset(x = pop, select = c("DISTRICT", "X1971", "X1981", "X2001", "X2012"))
str(pop)
## 'data.frame': 25 obs. of 5 variables:
## $ DISTRICT: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ X1971 : chr "2,672,265" "-" "729,514" "1,187,925" ...
## $ X1981 : chr "1,699,241" "1,390,862" "829,704" "1,048,317" ...
## $ X2001 : chr "2,251,274" "2,063,684" "1,066,239" "1,279,028" ...
## $ X2012 : chr "2,324,349" "2,304,833" "1,221,948" "1,375,382" ...
head(pop)
## DISTRICT X1971 X1981 X2001 X2012
## 13 Colombo 2,672,265 1,699,241 2,251,274 2,324,349
## 25 Gampaha - 1,390,862 2,063,684 2,304,833
## 37 Kalutara 729,514 829,704 1,066,239 1,221,948
## 49 Kandy 1,187,925 1,048,317 1,279,028 1,375,382
## 61 Matale 314,841 357,354 441,328 484,531
## 73 Nuwara Eliya 450,278 603,577 703,610 711,644
## fix the numbers
for (i in 2:5) {
pop[,i] <- as.numeric(gsub(pattern = ",",replacement = "", x=pop[,i],fixed = TRUE))
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## sort the census data
pop <- pop[order(pop$DISTRICT),]
##convert text so that it matches the shapefile
pop$DISTRICT <- toupper(pop$DISTRICT)
distr.shp.agg$DISTRICT %in% pop$DISTRICT
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE
pop$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE]
## [1] "MONERAGALA"
distr.shp.agg$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE]
## [1] MONARAGALA
## 25 Levels: AMPARA ANURADHAPURA BADULLA BATTICALOA COLOMBO ... VAVUNIYA
pop$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE] <- as.character(distr.shp.agg$DISTRICT[(distr.shp.agg$DISTRICT %in% pop$DISTRICT)==FALSE])
library(dplyr)
distr.shp.agg@data <- left_join(distr.shp.agg@data, pop)
## Joining by: "DISTRICT"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
## plot choropleht map
## --------------------------------------------------
## https://cran.r-project.org/web/packages/tmap/vignettes/tmap-nutshell.html
## https://cran.r-project.org/web/packages/tmap/tmap.pdf
## install.packages("tmap", dependencies = TRUE)
library(tmap)
## quick plot
qtm(distr.shp.agg, fill="X2001")
## detailed plot
tm_shape(distr.shp.agg) +
tm_fill("X2012", textNA="no data", title="Population 2012") +
tm_text("DISTRICT", size= .5) +
tm_borders() +
tm_layout(title = "Population Sri Lanka")
## Plot population development as Graph
## --------------------------------------------------
library(ggplot2)
a <- distr.shp.agg@data
library(reshape)
b <- melt(a, id.vars = c("PROVINCE", "DISTRICT"), measure.vars = c("X1971","X1981","X2001","X2012"))
str(b)
## 'data.frame': 100 obs. of 4 variables:
## $ PROVINCE: Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
## $ DISTRICT: chr "AMPARA" "ANURADHAPURA" "BADULLA" "BATTICALOA" ...
## $ variable: Factor w/ 4 levels "X1971","X1981",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 272605 388770 615405 256721 2672265 ...
b$value <- b$value/1000
levels(b$variable) <- c("1971","1981","2001","2012")
ggplot(data = b, aes(x = variable, y = value, group = DISTRICT, col = DISTRICT)) +
geom_line() +
geom_point() +
facet_grid(. ~ PROVINCE) +
labs(title = "Population development Sri Lanka",
x = "Census Year",
y = "Population in thousands") +
guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
theme(legend.position = "bottom",
text = element_text(size=10),
legend.text = element_text(size=8))
## Warning: Removed 4 rows containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).
Exercises
Tasks to perform: - data aggregation - Polygon union - Plotting
Which province has the highest population standard deviation? Calculate and map your results.
Is there a significant difference between the sexes in the different districts? Calculate and map your results.
pop <- read.table(file = "./data/Pop.csv",
header = TRUE,
sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop)
## 'data.frame': 288 obs. of 7 variables:
## $ Sri.lanka.standard.classification.of.area: chr "Sri Lanka" "Sri Lanka" "Sri Lanka" "Sri Lanka" ...
## $ Sector : chr "All sectors" "All sectors" "All sectors" "Urban" ...
## $ Sex : chr "Both Sexes" "Male" "Female" "Both Sexes" ...
## $ X1971 : chr "12,689,897" "6,531,361" "6,158,536" "-" ...
## $ X1981 : chr "14,846,750" "7,568,253" "7,278,497" "3,192,489" ...
## $ X2001 : chr "16,929,689" "8,425,607" "8,504,082" "2,467,301" ...
## $ X2012 : chr "20,359,439" "9,856,634" "10,502,805" "3,704,470" ...
pop <- pop[!pop$Sri.lanka.standard.classification.of.area=="Sri Lanka" & !pop$Sex=="Both Sexes" & pop$Sector=="All sectors",]
names(pop)[1] <- "DISTRICT"
pop <- subset(x = pop, select = c("DISTRICT", "Sex","X1971", "X1981", "X2001", "X2012"))
str(pop)
## 'data.frame': 50 obs. of 6 variables:
## $ DISTRICT: chr "Colombo" "Colombo" "Gampaha" "Gampaha" ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ X1971 : chr "1,400,514" "1,271,751" "-" "-" ...
## $ X1981 : chr "894,509" "804,732" "703,008" "687,854" ...
## $ X2001 : chr "1,151,413" "1,099,861" "1,007,702" "1,055,982" ...
## $ X2012 : chr "1,140,472" "1,183,877" "1,116,893" "1,187,940" ...
## fix the numbers
for (i in 3:6) {
pop[,i] <- as.numeric(gsub(pattern = ",",replacement = "", x=pop[,i],fixed = TRUE))
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
head(pop)
## DISTRICT Sex X1971 X1981 X2001 X2012
## 14 Colombo Male 1400514 894509 1151413 1140472
## 15 Colombo Female 1271751 804732 1099861 1183877
## 26 Gampaha Male NA 703008 1007702 1116893
## 27 Gampaha Female NA 687854 1055982 1187940
## 38 Kalutara Male 367457 413262 527281 591284
## 39 Kalutara Female 362057 416442 538958 630664
## Poverty head count ratio (%) by Province and District (Poverty Head Count Ratio)
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LSS_POV_101&conn_path=I2
poverty <- read.table(file = "./data/poverty.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(poverty)
## 'data.frame': 25 obs. of 7 variables:
## $ Province.and.district: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ X1991 : chr "16.2" "14.7" "32.3" "35.9" ...
## $ X1996 : chr "12.0" "14.1" "29.5" "36.7" ...
## $ X2002 : chr "6.4" "10.7" "20.0" "24.9" ...
## $ X2007 : chr "5.4" "8.7" "13.0" "17.0" ...
## $ X2010 : chr "3.6" "3.9" "6.0" "10.3" ...
## $ X2013 : num 1.4 2.1 3.1 6.2 7.8 6.6 9.9 7.1 4.9 8.3 ...
str(pop)
## 'data.frame': 50 obs. of 6 variables:
## $ DISTRICT: chr "Colombo" "Colombo" "Gampaha" "Gampaha" ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ X1971 : num 1400514 1271751 NA NA 367457 ...
## $ X1981 : num 894509 804732 703008 687854 413262 ...
## $ X2001 : num 1151413 1099861 1007702 1055982 527281 ...
## $ X2012 : num 1140472 1183877 1116893 1187940 591284 ...
## sex ratio
pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"]
## [1] 0.9633366 0.9401931 0.9375579 0.9113385 0.9313719 0.9166435 0.9213454
## [8] 0.9192682 0.9658187 0.8852600 1.0108246 0.9692792 0.9964071 0.9663243
## [15] 0.9086052 0.9382241 0.9760659 0.9238491 0.9378433 0.9537431 0.9779453
## [22] 0.9255910 0.9880030 0.9724350 0.9113108
## is poverty and sex ratio related?
plot(pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"], poverty$X2013)
cor(pop$X2012[pop$Sex=="Male"]/pop$X2012[pop$Sex=="Female"], poverty$X2013)
## [1] 0.397554
## is the distribution of sexes significantly different?
## does do ratio between the sexes change?
str(pop)
## 'data.frame': 50 obs. of 6 variables:
## $ DISTRICT: chr "Colombo" "Colombo" "Gampaha" "Gampaha" ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ X1971 : num 1400514 1271751 NA NA 367457 ...
## $ X1981 : num 894509 804732 703008 687854 413262 ...
## $ X2001 : num 1151413 1099861 1007702 1055982 527281 ...
## $ X2012 : num 1140472 1183877 1116893 1187940 591284 ...
library(plyr)
a <- pop[pop$DISTRICT=="Colombo",]
a$X1971[2]
## [1] 1271751
sr <- ddply(pop, "DISTRICT", function(x) {
sr1 <- x$X1971[1]/x$X1971[2]
sr2 <- x$X1981[1]/x$X1981[2]
sr3 <- x$X2001[1]/x$X2001[2]
sr4 <- x$X2012[1]/x$X2012[2]
data.frame(sr.1971 = sr1,
sr.1981 = sr2,
sr.2001 = sr3,
sr.2012 = sr4)
})
str(sr)
## 'data.frame': 25 obs. of 5 variables:
## $ DISTRICT: chr "Ampara" "Anuradhapura" "Badulla" "Batticaloa" ...
## $ sr.1971 : num 1.1 1.15 1.05 1.07 1.1 ...
## $ sr.1981 : num 1.1 1.15 1.03 1.03 1.11 ...
## $ sr.2001 : num 1.017 1.042 0.988 NA 1.047 ...
## $ sr.2012 : num 0.938 0.954 0.926 0.909 0.963 ...
library(reshape)
sr2 <- melt(sr, id.vars = c("DISTRICT"), measure.vars = c("sr.1971","sr.1981","sr.2001","sr.2012"))
str(sr2)
## 'data.frame': 100 obs. of 3 variables:
## $ DISTRICT: chr "Ampara" "Anuradhapura" "Badulla" "Batticaloa" ...
## $ variable: Factor w/ 4 levels "sr.1971","sr.1981",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 1.1 1.15 1.05 1.07 1.1 ...
ggplot(sr2, aes(x=variable, y=value, group=DISTRICT, col=DISTRICT)) +
geom_line() +
geom_point() +
labs(title = "Sex-ratio development in Sri Lanka",
x = "Census Year",
y = "Sex ratio (Male/Female)") +
guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
theme(legend.position = "bottom",
text = element_text(size=10),
legend.text = element_text(size=8))
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 11 rows containing missing values (geom_point).
## Join with district data
distr <- readOGR(dsn = "./results/", layer="districts")
## OGR data source with driver: ESRI Shapefile
## Source: "./results/", layer: "districts"
## with 25 features
## It has 2 fields
## sort the census data
sr <- sr[order(sr$DISTRICT),]
##convert text so that it matches the shapefile
sr$DISTRICT <- toupper(sr$DISTRICT)
sr
## DISTRICT sr.1971 sr.1981 sr.2001 sr.2012
## 1 AMPARA 1.0995618 1.0987628 1.0170446 0.9382241
## 2 ANURADHAPURA 1.1530393 1.1484236 1.0420043 0.9537431
## 3 BADULLA 1.0450512 1.0324519 0.9877243 0.9255910
## 4 BATTICALOA 1.0726038 1.0301449 NA 0.9086052
## 5 COLOMBO 1.1012486 1.1115614 1.0468714 0.9633366
## 6 GALLE 0.9771591 0.9497629 0.9473319 0.9213454
## 7 GAMPAHA NA 1.0220308 0.9542795 0.9401931
## 8 HAMBANTOTA 1.0578676 1.0451595 0.9999240 0.9658187
## 9 JAFFNA 0.9951004 0.9876417 NA 0.8852600
## 10 KALUTARA 1.0149148 0.9923639 0.9783341 0.9375579
## 11 KANDY 1.0389117 0.9818493 0.9525297 0.9113385
## 12 KEGALLE 1.0384114 0.9865484 0.9593476 0.9113108
## 13 KILINOCHCHI NA NA NA 0.9663243
## 14 KURUNEGALA 1.0531755 1.0231076 0.9811990 0.9238491
## 15 MANNAR 1.1638013 1.1460316 NA 1.0108246
## 16 MATALE 1.0634487 1.0356600 0.9946307 0.9313719
## 17 MATARA 0.9752672 0.9415824 0.9416612 0.9192682
## 18 MONERAGALA 1.1512639 1.2113100 1.0424396 0.9880030
## 19 MULLAITIVU NA 1.2327673 NA 0.9964071
## 20 NUWARA ELIYA 1.0405224 1.0153831 0.9899261 0.9166435
## 21 POLONNARUWA 1.2485262 1.3105864 1.0969304 0.9779453
## 22 PUTTALAM 1.0602903 1.0360680 0.9866554 0.9378433
## 23 RATNAPURA 1.0885251 1.0763151 1.0198825 0.9724350
## 24 TRINCOMALEE 1.1906530 1.1589513 NA 0.9760659
## 25 VAVUNIYA 1.2420669 1.1385863 NA 0.9692792
distr$DISTRICT %in% sr$DISTRICT
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE
sr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE]
## [1] "MONERAGALA"
distr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE]
## [1] MONARAGALA
## 25 Levels: AMPARA ANURADHAPURA BADULLA BATTICALOA COLOMBO ... VAVUNIYA
sr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE] <- as.character(distr$DISTRICT[(distr$DISTRICT %in% sr$DISTRICT)==FALSE])
library(dplyr)
distr@data <- left_join(distr@data, sr)
## Joining by: "DISTRICT"
## Warning in left_join_impl(x, y, by$x, by$y): joining character vector and
## factor, coercing into character vector
str(distr@data)
## 'data.frame': 25 obs. of 6 variables:
## $ DISTRICT: chr "AMPARA" "ANURADHAPURA" "BADULLA" "BATTICALOA" ...
## $ PROVINCE: Factor w/ 9 levels "CP","EP","NC",..: 2 3 8 2 9 7 9 7 4 9 ...
## $ sr.1971 : num 1.1 1.15 1.05 1.07 1.1 ...
## $ sr.1981 : num 1.1 1.15 1.03 1.03 1.11 ...
## $ sr.2001 : num 1.017 1.042 0.988 NA 1.047 ...
## $ sr.2012 : num 0.938 0.954 0.926 0.909 0.963 ...
spplot(obj = distr, zcol = c("sr.1971","sr.1981","sr.2001","sr.2012"))
## relation between sex ratio and mean elevation?
## H: the higher the elevation, the more tee production, the more women working
library(raster)
srtm <- raster("./results/srtm.tif")
distr@data$elev <- extract(x = srtm, y = distr, fun=mean, na.rm = TRUE)
## Warning in .local(x, y, ...): Transforming SpatialPolygons to the CRS of
## the Raster
cor(distr@data$elev,distr@data$sr.2012)
## [,1]
## [1,] -0.3166771
## Population by age group, sex, residential sector and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_POP_GEO_174&conn_path=I2
## -> Pop 2001
pop2 <- read.table(file = "./data/Pop2.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(pop2)
## 'data.frame': 67 obs. of 5 variables:
## $ Residential.sector: chr "Residential sector" "Urban" "Urban" "Urban" ...
## $ District : chr "District" "Colombo" "Gampaha" "Kalutara" ...
## $ Sex : chr "Sex" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ X2001 : chr "All ages" "1,229,572" "300,933" "113,188" ...
## $ X2012 : chr "All ages" "1,802,904" "360,221" "109,069" ...
pop2 <- pop2[2:67,]
pop2$X2001 <- as.numeric(gsub(",","",pop2$X2001))
## Warning: NAs introduced by coercion
pop2$X2012 <- as.numeric(gsub(",","",pop2$X2012))
## Warning: NAs introduced by coercion
head(pop2)
## Residential.sector District Sex X2001 X2012
## 2 Urban Colombo Both Sexes 1229572 1802904
## 3 Urban Gampaha Both Sexes 300933 360221
## 4 Urban Kalutara Both Sexes 113188 109069
## 5 Urban Kandy Both Sexes 155987 170544
## 6 Urban Matale Both Sexes 36103 60276
## 7 Urban Nuwara Eliya Both Sexes 43073 40151
str(pop2)
## 'data.frame': 66 obs. of 5 variables:
## $ Residential.sector: chr "Urban" "Urban" "Urban" "Urban" ...
## $ District : chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ Sex : chr "Both Sexes" "Both Sexes" "Both Sexes" "Both Sexes" ...
## $ X2001 : num 1229572 300933 113188 155987 36103 ...
## $ X2012 : num 1802904 360221 109069 170544 60276 ...
library(ggplot2)
ggplot(pop2, aes(x=District,y=X2001, fill=Residential.sector)) +
geom_bar(stat = "identity") +
coord_flip()
## Warning: Removed 15 rows containing missing values (position_stack).
## Employed population by sex, province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_EMP_101&conn_path=I2
employed <- read.table(file = "./data/Employed_population.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(employed)
## 'data.frame': 144 obs. of 4 variables:
## $ Period : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Province.and.district: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ Male : chr "607,310" "619,385" "295,405" "319,442" ...
## $ Female : chr "285,796" "305,149" "150,323" "161,706" ...
employed$Male <- as.numeric(gsub(",","",employed$Male))
employed$Female <- as.numeric(gsub(",","",employed$Female))
names(employed)[2] <- "District"
head(employed)
## Period District Male Female
## 1 2006 Colombo 607310 285796
## 2 2006 Gampaha 619385 305149
## 3 2006 Kalutara 295405 150323
## 4 2006 Kandy 319442 161706
## 5 2006 Matale 104577 48459
## 6 2006 Nuwara Eliya 160899 134758
library(reshape)
employed2 <- melt(employed, id.vars = c("District","Period"),measure.vars = c("Male","Female"))
str(employed2)
## 'data.frame': 288 obs. of 4 variables:
## $ District: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ Period : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ variable: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 607310 619385 295405 319442 104577 ...
names(employed2)[3:4] <- c("Sex","Employed")
library(ggplot2)
ggplot(employed2, aes(x=District,y=Employed, fill = Sex)) +
geom_bar(stat="identity") +
coord_flip()
## Unemployment rate (%) by province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_UNP_101&conn_path=I2
unemployed <- read.table(file = "./data/Unemployment_rate.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(unemployed)
## 'data.frame': 25 obs. of 8 variables:
## $ Province.and.district: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ X2006 : chr "5.6" "5.8" "5.4" "8.3" ...
## $ X2007 : chr "5.3" "4.7" "8.0" "7.6" ...
## $ X2008 : chr "4.1" "3.8" "5.9" "6.4" ...
## $ X2009 : chr "4.4" "4.6" "4.1" "9.7" ...
## $ X2010 : chr "3.3" "4.1" "3.7" "9.6" ...
## $ X2011 : num 2.9 4 3.4 8.1 5 1.9 3.8 6 6.5 3.1 ...
## $ X2012 : num 2.9 3.7 4 7.2 2.8 1.8 2.3 7 5.3 5.1 ...
tmp <- apply(X = unemployed[2:8], MARGIN = 2, FUN = function(x) {as.numeric(gsub(",","",x))})
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
head(tmp)
## X2006 X2007 X2008 X2009 X2010 X2011 X2012
## [1,] 5.6 5.3 4.1 4.4 3.3 2.9 2.9
## [2,] 5.8 4.7 3.8 4.6 4.1 4.0 3.7
## [3,] 5.4 8.0 5.9 4.1 3.7 3.4 4.0
## [4,] 8.3 7.6 6.4 9.7 9.6 8.1 7.2
## [5,] 4.6 3.6 6.1 5.4 5.2 5.0 2.8
## [6,] 5.1 4.1 5.2 2.4 3.0 1.9 1.8
unemployed <- data.frame(DISTRICT = unemployed$Province.and.district, tmp)
str(unemployed)
## 'data.frame': 25 obs. of 8 variables:
## $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 7 10 11 16 20 6 17 8 9 ...
## $ X2006 : num 5.6 5.8 5.4 8.3 4.6 5.1 9.4 8.2 9.4 NA ...
## $ X2007 : num 5.3 4.7 8 7.6 3.6 4.1 7.8 9.2 8.7 NA ...
## $ X2008 : num 4.1 3.8 5.9 6.4 6.1 5.2 6.5 8.9 9.6 NA ...
## $ X2009 : num 4.4 4.6 4.1 9.7 5.4 2.4 8.3 9.8 10.6 NA ...
## $ X2010 : num 3.3 4.1 3.7 9.6 5.2 3 6.4 8.8 8.9 NA ...
## $ X2011 : num 2.9 4 3.4 8.1 5 1.9 3.8 6 6.5 3.1 ...
## $ X2012 : num 2.9 3.7 4 7.2 2.8 1.8 2.3 7 5.3 5.1 ...
library(reshape)
unemployed2 <- melt(unemployed, id.vars = "DISTRICT", measure.vars = names(unemployed[,2:8]))
str(unemployed2)
## 'data.frame': 175 obs. of 3 variables:
## $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 7 10 11 16 20 6 17 8 9 ...
## $ variable: Factor w/ 7 levels "X2006","X2007",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 5.6 5.8 5.4 8.3 4.6 5.1 9.4 8.2 9.4 NA ...
names(unemployed2)[2:3] <- c("Year","Unemployed")
head(unemployed2)
## DISTRICT Year Unemployed
## 1 Colombo X2006 5.6
## 2 Gampaha X2006 5.8
## 3 Kalutara X2006 5.4
## 4 Kandy X2006 8.3
## 5 Matale X2006 4.6
## 6 Nuwara Eliya X2006 5.1
library(ggplot2)
ggplot(unemployed2, aes(x = Year, y = Unemployed, group = DISTRICT, col=DISTRICT)) +
geom_line() +
guides(col=guide_legend(nrow=5, keyheight=.5, keywidth=.5)) +
theme(legend.position = "bottom",
text = element_text(size=10),
legend.text = element_text(size=8))
## Warning: Removed 31 rows containing missing values (geom_path).
## Is there a significant difference in the employment of Male and Female?
## propotion of males employed
str(employed)
## 'data.frame': 144 obs. of 4 variables:
## $ Period : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ District: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ Male : num 607310 619385 295405 319442 104577 ...
## $ Female : num 285796 305149 150323 161706 48459 ...
m.e.2012 <- employed2[employed2$Sex=="Male" & employed2$Period==2012,]
m.e.2012
## District Period Sex Employed
## 120 Colombo 2012 Male 604050
## 121 Gampaha 2012 Male 628967
## 122 Kalutara 2012 Male 377978
## 123 Kandy 2012 Male 298396
## 124 Matale 2012 Male 124522
## 125 Nuwara Eliya 2012 Male 165075
## 126 Galle 2012 Male 246552
## 127 Matara 2012 Male 223975
## 128 Hambantota 2012 Male 189204
## 129 Jaffna 2012 Male 140919
## 130 Mannar 2012 Male 26467
## 131 Vavuniya 2012 Male 38331
## 132 Mullaitivu 2012 Male 18527
## 133 Kilinochchi 2012 Male 20821
## 134 Batticaloa 2012 Male 111534
## 135 Ampara 2012 Male 153537
## 136 Trincomalee 2012 Male 98978
## 137 Kurunegala 2012 Male 466534
## 138 Puttalam 2012 Male 229636
## 139 Anuradhapura 2012 Male 216072
## 140 Polonnaruwa 2012 Male 119725
## 141 Badulla 2012 Male 247652
## 142 Moneragala 2012 Male 154025
## 143 Ratnapura 2012 Male 347000
## 144 Kegalle 2012 Male 228611
str(pop)
## 'data.frame': 50 obs. of 6 variables:
## $ DISTRICT: chr "Colombo" "Colombo" "Gampaha" "Gampaha" ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ X1971 : num 1400514 1271751 NA NA 367457 ...
## $ X1981 : num 894509 804732 703008 687854 413262 ...
## $ X2001 : num 1151413 1099861 1007702 1055982 527281 ...
## $ X2012 : num 1140472 1183877 1116893 1187940 591284 ...
library(reshape)
pop.r <- melt(pop, id.vars = c("DISTRICT", "Sex"), measure.vars = names(pop[3:6]))
str(pop.r)
## 'data.frame': 200 obs. of 4 variables:
## $ DISTRICT: chr "Colombo" "Colombo" "Gampaha" "Gampaha" ...
## $ Sex : chr "Male" "Female" "Male" "Female" ...
## $ variable: Factor w/ 4 levels "X1971","X1981",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 1400514 1271751 NA NA 367457 ...
names(pop.r) <- c("DISTRICT","Sex","Year","Population")
m.2012 <- pop.r[pop.r$Sex=="Male" & pop.r$Year=="X2012",]
m.2012
## DISTRICT Sex Year Population
## 151 Colombo Male X2012 1140472
## 153 Gampaha Male X2012 1116893
## 155 Kalutara Male X2012 591284
## 157 Kandy Male X2012 655791
## 159 Matale Male X2012 233657
## 161 Nuwara Eliya Male X2012 340347
## 163 Galle Male X2012 509902
## 165 Matara Male X2012 389903
## 167 Hambantota Male X2012 294736
## 169 Jaffna Male X2012 274173
## 171 Mannar Male X2012 50053
## 173 Vavuniya Male X2012 84715
## 175 Mullaitivu Male X2012 46036
## 177 Kilinochchi Male X2012 55783
## 179 Batticaloa Male X2012 250676
## 181 Ampara Male X2012 314352
## 183 Trincomalee Male X2012 187472
## 185 Kurunegala Male X2012 777201
## 187 Puttalam Male X2012 368971
## 189 Anuradhapura Male X2012 420100
## 191 Polonnaruwa Male X2012 200780
## 193 Badulla Male X2012 391948
## 195 Moneragala Male X2012 224168
## 197 Ratnapura Male X2012 536401
## 199 Kegalle Male X2012 400820
p.m.e.2012 <- data.frame(DISTRICT = m.e.2012[,1], pme = m.e.2012$Employed/m.2012$Population)
p.m.e.2012
## DISTRICT pme
## 1 Colombo 0.5296491
## 2 Gampaha 0.5631399
## 3 Kalutara 0.6392495
## 4 Kandy 0.4550169
## 5 Matale 0.5329265
## 6 Nuwara Eliya 0.4850197
## 7 Galle 0.4835282
## 8 Matara 0.5744377
## 9 Hambantota 0.6419440
## 10 Jaffna 0.5139784
## 11 Mannar 0.5287795
## 12 Vavuniya 0.4524700
## 13 Mullaitivu 0.4024459
## 14 Kilinochchi 0.3732499
## 15 Batticaloa 0.4449329
## 16 Ampara 0.4884238
## 17 Trincomalee 0.5279615
## 18 Kurunegala 0.6002746
## 19 Puttalam 0.6223687
## 20 Anuradhapura 0.5143347
## 21 Polonnaruwa 0.5962994
## 22 Badulla 0.6318491
## 23 Moneragala 0.6870963
## 24 Ratnapura 0.6469041
## 25 Kegalle 0.5703583
## proportion of females employed
str(employed)
## 'data.frame': 144 obs. of 4 variables:
## $ Period : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ District: chr "Colombo" "Gampaha" "Kalutara" "Kandy" ...
## $ Male : num 607310 619385 295405 319442 104577 ...
## $ Female : num 285796 305149 150323 161706 48459 ...
f.e.2012 <- employed2[employed2$Sex=="Female" & employed2$Period==2012,]
f.e.2012
## District Period Sex Employed
## 264 Colombo 2012 Female 298470
## 265 Gampaha 2012 Female 293426
## 266 Kalutara 2012 Female 166909
## 267 Kandy 2012 Female 156931
## 268 Matale 2012 Female 65560
## 269 Nuwara Eliya 2012 Female 119923
## 270 Galle 2012 Female 122244
## 271 Matara 2012 Female 108574
## 272 Hambantota 2012 Female 84111
## 273 Jaffna 2012 Female 51760
## 274 Mannar 2012 Female 4533
## 275 Vavuniya 2012 Female 16150
## 276 Mullaitivu 2012 Female 5055
## 277 Kilinochchi 2012 Female 4228
## 278 Batticaloa 2012 Female 33179
## 279 Ampara 2012 Female 40636
## 280 Trincomalee 2012 Female 29419
## 281 Kurunegala 2012 Female 234939
## 282 Puttalam 2012 Female 88092
## 283 Anuradhapura 2012 Female 139293
## 284 Polonnaruwa 2012 Female 49146
## 285 Badulla 2012 Female 160484
## 286 Moneragala 2012 Female 78938
## 287 Ratnapura 2012 Female 176466
## 288 Kegalle 2012 Female 123151
f.2012 <- pop.r[pop.r$Sex=="Female" & pop.r$Year=="X2012",]
f.2012
## DISTRICT Sex Year Population
## 152 Colombo Female X2012 1183877
## 154 Gampaha Female X2012 1187940
## 156 Kalutara Female X2012 630664
## 158 Kandy Female X2012 719591
## 160 Matale Female X2012 250874
## 162 Nuwara Eliya Female X2012 371297
## 164 Galle Female X2012 553432
## 166 Matara Female X2012 424145
## 168 Hambantota Female X2012 305167
## 170 Jaffna Female X2012 309709
## 172 Mannar Female X2012 49517
## 174 Vavuniya Female X2012 87400
## 176 Mullaitivu Female X2012 46202
## 178 Kilinochchi Female X2012 57727
## 180 Batticaloa Female X2012 275891
## 182 Ampara Female X2012 335050
## 184 Trincomalee Female X2012 192069
## 186 Kurunegala Female X2012 841264
## 188 Puttalam Female X2012 393425
## 190 Anuradhapura Female X2012 440475
## 192 Polonnaruwa Female X2012 205308
## 194 Badulla Female X2012 423457
## 196 Moneragala Female X2012 226890
## 198 Ratnapura Female X2012 551606
## 200 Kegalle Female X2012 439828
p.f.e.2012 <- data.frame(DISTRICT = f.e.2012[,1], pfe = f.e.2012$Employed/f.2012$Population)
p.f.e.2012
## DISTRICT pfe
## 1 Colombo 0.25211234
## 2 Gampaha 0.24700406
## 3 Kalutara 0.26465598
## 4 Kandy 0.21808361
## 5 Matale 0.26132640
## 6 Nuwara Eliya 0.32298403
## 7 Galle 0.22088351
## 8 Matara 0.25598321
## 9 Hambantota 0.27562286
## 10 Jaffna 0.16712462
## 11 Mannar 0.09154432
## 12 Vavuniya 0.18478261
## 13 Mullaitivu 0.10941085
## 14 Kilinochchi 0.07324129
## 15 Batticaloa 0.12026126
## 16 Ampara 0.12128339
## 17 Trincomalee 0.15316891
## 18 Kurunegala 0.27926905
## 19 Puttalam 0.22391053
## 20 Anuradhapura 0.31623361
## 21 Polonnaruwa 0.23937694
## 22 Badulla 0.37898535
## 23 Moneragala 0.34791309
## 24 Ratnapura 0.31991313
## 25 Kegalle 0.27999809
t.test(p.m.e.2012[,2],p.f.e.2012[,2]) #??
##
## Welch Two Sample t-test
##
## data: p.m.e.2012[, 2] and p.f.e.2012[, 2]
## t = 13.383, df = 47.983, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.2645002 0.3580250
## sample estimates:
## mean of x mean of y
## 0.5402655 0.2290029
## Employed population by level of education, sex, province and district
## http://sis.statistics.gov.lk/statHtml/statHtml.do?orgId=144&tblId=DT_LFE_EMP_107&conn_path=I2
une.edu <- read.table(file = "./data/Employed_population_edu.csv", header = TRUE, sep = ",", skip = 0, stringsAsFactors = FALSE)
str(une.edu)
## 'data.frame': 576 obs. of 5 variables:
## $ Period : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ Province.and.district: chr "Colombo" "Colombo" "Colombo" "Colombo" ...
## $ Level.of.education : chr "Below 5" "6 - 10" "GCE O/L" "GCE A/L" ...
## $ Male : chr "67,524" "259,471" "122,423" "157,892" ...
## $ Female : chr "30,200" "94,441" "61,268" "99,888" ...
tmp <- apply(X = une.edu[,4:5], MARGIN = 2, FUN = function(x) {as.numeric(gsub(",","",x))})
head(tmp)
## Male Female
## [1,] 67524 30200
## [2,] 259471 94441
## [3,] 122423 61268
## [4,] 157892 99888
## [5,] 57123 31128
## [6,] 301787 122353
une.edu <- data.frame(YEAR = une.edu$Period,
DISTRICT = une.edu$Province.and.district,
LoE = une.edu$Level.of.education,
tmp)
str(une.edu)
## 'data.frame': 576 obs. of 5 variables:
## $ YEAR : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 5 5 5 7 7 7 7 10 10 ...
## $ LoE : Factor w/ 4 levels "6 - 10","Below 5",..: 2 1 4 3 2 1 4 3 2 1 ...
## $ Male : num 67524 259471 122423 157892 57123 ...
## $ Female : num 30200 94441 61268 99888 31128 ...
library(reshape)
une.edu2 <- melt(une.edu, id.vars = c("YEAR","DISTRICT","LoE"), measure.vars = c("Male","Female"))
str(une.edu2)
## 'data.frame': 1152 obs. of 5 variables:
## $ YEAR : int 2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
## $ DISTRICT: Factor w/ 25 levels "Ampara","Anuradhapura",..: 5 5 5 5 7 7 7 7 10 10 ...
## $ LoE : Factor w/ 4 levels "6 - 10","Below 5",..: 2 1 4 3 2 1 4 3 2 1 ...
## $ variable: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 67524 259471 122423 157892 57123 ...
names(une.edu2)[4:5] <- c("Sex","num_employed")
une.edu2[1,]
## YEAR DISTRICT LoE Sex num_employed
## 1 2006 Colombo Below 5 Male 67524
ggplot(une.edu2, aes(x=Sex, y=num_employed, fill=Sex)) +
geom_bar(stat="identity") +
facet_grid(LoE ~ DISTRICT)
## is employment rate linked to level of education?
a <- cbind(une.edu2[une.edu2$DISTRICT==levels(une.edu2$DISTRICT)[20] & une.edu2$Sex=="Male" & une.edu2$YEAR==2012,5],
une.edu2[une.edu2$DISTRICT==levels(une.edu2$DISTRICT)[20] & une.edu2$Sex=="Female" & une.edu2$YEAR==2012,5]
)
a
## [,1] [,2]
## [1,] 48006 49088
## [2,] 95228 55045
## [3,] 13180 7523
## [4,] 8661 8267
dimnames(a) <- list("LoE"=levels(une.edu2$LoE),
"Sex"=levels(une.edu2$Sex)
)
a
## Sex
## LoE Male Female
## 6 - 10 48006 49088
## Below 5 95228 55045
## GCE A/L 13180 7523
## GCE O/L 8661 8267
#barplot(a, beside = TRUE)
prop.table(a)
## Sex
## LoE Male Female
## 6 - 10 0.16844329 0.17223981
## Below 5 0.33413568 0.19314171
## GCE A/L 0.04624594 0.02639668
## GCE O/L 0.03038969 0.02900722
chisq.test(a) #??
##
## Pearson's Chi-squared test
##
## data: a
## X-squared = 5291.3, df = 3, p-value < 2.2e-16