Loading libraries and Data

Parsed with column specification:
cols(
  .default = col_character(),
  `If you left your current role now, how easy would it be for you to find a new one?` = col_integer(),
  `How would you rate your bargaining skills in negotiations with a potential employer about a new contract?` = col_integer()
)
See spec(...) for full column specifications.

Cleanup

We rename the columns to make them easier to work with.

# Swap the verbose survey-question headers for short snake_case names.
# Each entry reads: new_name = `original column header`.
phackers <- rename(
  phackers_raw,
  job_title = `What is your job title?`,
  dept = `What is your department (if applicable)?`,
  sal_raw = `Please enter your approximate monthly base salary in Philippine Pesos.`,
  sal_increase_raw = `How much more is your current salary, than your salary three years ago?`,
  sal_bonus_raw = `Approximately how much was your bonus from last year (aside from any mandatory bonuses -- e.g 13th month)?`,
  in_ph = `Are you currently living/working in the Philippines?`,
  ph_city = `If you currently work in the Philippines, which city do you work in?`,
  age_range = `What is your age?`,
  exp_raw = `How many years of experience do you have in your field?`,
  gender = `What is your gender?`,
  educ_stmts = `Which of the following statements about your education are true?`,
  university = `Which university did you attend last?`,
  university_other = `If your university was not listed in the previous question, you may add it here.`,
  certifications = `What certifications do you currently have?`,
  employment_status = `Are you a full- or part-time employee of a company, or a freelancer?`,
  company_industry = `What is your company's business or industry?`,
  company_size = `How many employees work at your company?`,
  company_age = `How long has your company been in business?`,
  os_used = `Operating systems`,
  proglang_used = `Programming Languages`,
  datatools_used = `Data tools`,
  cloud_used = `Cloud/Containers`,
  worktasks_noncollab = `Which of the following tasks play a part in your workday? [Writing code for non-collaborative projects (no one else will work on this code)]`,
  worktasks_collab = `Which of the following tasks play a part in your workday? [Writing code for collaborative projects]`,
  worktasks_collabreview = `Which of the following tasks play a part in your workday? [Reading/editing code originally written by others (e.g., using git)]`
)

Conversions

Convert the salary ranges, which are stored as strings, to a single number: the midpoint of each range.

# Earlier approach, kept for reference: use only the lower bound of each range.
# phackers$sal_num <- parse_number(sapply(strsplit(phackers$sal_raw, " - "), function(x){x[1]}))

# Salary answers are text ranges ("low - high"). Split each answer on " - ",
# parse every piece as a number and take the mean of the pieces.
# vapply() guarantees exactly one number per respondent. The former nested
# sapply() was type-unstable: had every answer contained two bounds, the inner
# call would have simplified to a matrix and the outer call would then have
# returned one value per bound instead of one per respondent.
phackers$sal_num <- vapply(
  strsplit(phackers$sal_raw, " - "),
  function(bounds) mean(parse_number(bounds)),
  numeric(1)
)

# Count "Less than 1 year" as 0 years so the column parses as a number.
# %in% keeps the mask free of NA when an answer is missing; the replacement is
# the string "0" because exp_raw is still text at this point.
phackers$exp_raw[phackers$exp_raw %in% "Less than 1 year"] <- "0"
phackers$exp_num <- parse_number(phackers$exp_raw)

# City as a factor (levels in default alphabetical order).
phackers$ph_city_f <- factor(phackers$ph_city)

Convert company size to factors so we can control order of facets

# Company size as a factor with an explicit level order, so that facets and
# legends follow company size rather than alphabetical order.
# http://stackoverflow.com/questions/14262497/fixing-the-order-of-facets-in-ggplot
# The level strings must match the raw answers exactly (note "501- 1,000").
phackers$company_size_f <- factor(
  phackers$company_size,
  levels = c(
    "1", "2 - 25", "25 - 100", "101 - 500",
    "501- 1,000", "1,001 - 2,500", "2,501 - 10,000", "10,001 or more"
  )
)

# Company industry as a factor (default alphabetical levels).
phackers$company_industry_f <- factor(phackers$company_industry)

# Gender as a factor with a fixed level order (overwrites the text column).
phackers$gender <- factor(
  phackers$gender,
  levels = c("Male", "Female", "Prefer not to say")
)

# Company age as a factor, ordered from youngest to oldest.
# unique(phackers$company_age) lists the raw answers to sort.
phackers$company_age_f <- factor(
  phackers$company_age,
  levels = c(
    "less than 2 years", "2 - 5 years", "6 - 10 years",
    "11 - 20 years", "20 years or longer"
  )
)

# Employment status as a factor with short display labels.
phackers$employment_f <- factor(
  phackers$employment_status,
  levels = c(
    "Employed in a company, full or part-time",
    "Freelance or independent contractor"
  ),
  labels = c("Employed", "Freelance")
)
# Distinct age brackets in alphabetical order; the starting point for the
# age factor levels.
l <- phackers$age_range %>%
  unique() %>%
  sort()
print(l)
[1] "21-25"    "26-30"    "31-35"    "36-40"    "41-45"    "46-50"    "51-55"    "Under 21"
# Alphabetical sorting leaves "Under 21" in the last (8th) position; move it
# to the front so the brackets run from youngest to oldest.
age_levels <- l[c(8, 1:7)]
print(age_levels)
[1] "Under 21" "21-25"    "26-30"    "31-35"    "36-40"    "41-45"    "46-50"    "51-55"   
# Age bracket as a factor ordered from youngest to oldest.
# This must use `levels`, not `labels`: `labels` renames the default
# (alphabetically sorted) levels position by position, which tagged every
# "21-25" respondent as "Under 21", every "26-30" as "21-25", and so on.
# `levels` only sets the order and leaves each respondent's bracket intact.
phackers$age <- factor(phackers$age_range, levels = age_levels)

Outlier labelling

We label as outliers those respondents who (a) are outside the Philippines, or (b) are above the upper fence of Q3 + 3 × IQR in either salary or years of experience.

# Summary statistics of the numeric salary; the quartiles are reused below to
# build the outlier fence.
sal_smry <- summary(phackers$sal_num)
print(sal_smry)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  10000   32500   52500   78639   97500  500000 
# Upper outlier fences at Q3 + 3 * IQR, for salary and for experience.
sal_iqr <- sal_smry["3rd Qu."] - sal_smry["1st Qu."]
# The salary fence was previously anchored at the 1st quartile, which put it
# a full IQR too low and flagged too many respondents. It is now anchored at
# the 3rd quartile, consistent with the experience fence below.
sal_upper_fence <- sal_smry["3rd Qu."] + 3 * sal_iqr

exp_summary <- summary(phackers$exp_num)
exp_iqr <- exp_summary["3rd Qu."] - exp_summary["1st Qu."]
exp_upper_fence <- exp_summary["3rd Qu."] + 3 * exp_iqr

# A respondent is an outlier when ANY of the three conditions holds.
# NOTE(review): in_ph is renamed straight from a survey column that appears to
# be read as text; if the answers are stored as e.g. "Yes"/"No", comparing
# with FALSE is never TRUE and nobody is excluded for location -- verify the
# actual values of in_ph.
phackers$outlier <- phackers$sal_num > sal_upper_fence |
  phackers$exp_num > exp_upper_fence |
  phackers$in_ph == FALSE

# filter() keeps only rows where the condition is TRUE, so rows whose outlier
# flag is NA (missing salary, experience or location) are dropped as well.
phackers <- phackers %>% filter(!outlier)
# From this point on, no need to filter(!outlier)

Exploratory Data Analysis

Gender, Age and Experience

# Median, mean and standard deviation of salary for every
# age bracket / gender combination.
phackers %>%
  group_by(age, gender) %>%
  summarise(
    med_salary = median(sal_num),
    mean_sal = mean(sal_num),
    sd = sd(sal_num)
  )

# Salary against years of experience, one linear trend line per gender.
# Respondents who preferred not to state a gender are left out of this plot.
phackers %>%
  filter(gender != "Prefer not to say") %>%
  ggplot(aes(x = exp_num, y = sal_num, color = gender)) +
  geom_jitter(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(labels = comma) +
  xlab("Years of experience in field") +
  ylab("Salary")

# Respondent count per age bracket, and its share of all respondents (in %).
phackers %>%
  group_by(age) %>%
  summarise(
    n = n(),
    percent = round(100 * n() / nrow(phackers), 2)
  )

# Stacked bar chart of the same counts, split by gender.
phackers %>%
  ggplot(aes(age, fill = gender)) +
  geom_bar(stat = "count") +
  ylab(label = "respondents") +
  ggtitle("Age Distribution", subtitle = "by gender")

# Histogram of years of experience in one-year bins, stacked by gender.
phackers %>%
  ggplot(aes(x = exp_num, fill = gender)) +
  geom_histogram(binwidth = 1) +
  ylab(label = "respondents") +
  xlab("years of experience")

Where do they work?

Company Size and Age

# Employed respondents per company size.
# The percentage is taken over ALL respondents in phackers (freelancers
# included), so the column does not add up to 100.
phackers %>%
  filter(employment_f == "Employed") %>%
  group_by(company_size_f) %>%
  summarize(
    n = n(),
    percentage = round(100 * n() / nrow(phackers), 2)
  ) %>%
  rename(`Company Size` = company_size_f)

# Same counts as a bar chart, with each bar split by company age.
phackers %>%
  filter(employment_f == "Employed") %>%
  ggplot(aes(company_size_f, fill = company_age_f)) +
  geom_bar(stat = "count") +
  scale_fill_discrete("Company Age") +
  xlab("Company Size")

# Employed respondents per company age.
# The percentage is taken over ALL respondents in phackers (freelancers
# included), so the column does not add up to 100.
phackers %>%
  filter(employment_f == "Employed") %>%
  group_by(company_age_f) %>%
  summarize(
    n = n(),
    percentage = round(100 * n() / nrow(phackers), 2)
  ) %>%
  rename(`Company Age` = company_age_f)

# Same counts as a bar chart, with each bar split by company size.
phackers %>%
  filter(employment_f == "Employed") %>%
  ggplot(aes(company_age_f, fill = company_size_f)) +
  geom_bar(stat = "count") +
  xlab("Company Age") +
  scale_fill_discrete("Company Size")

# Show the distinct raw company-size answers still present in the data.
phackers$company_size %>% unique()
[1] "25 - 100"       "1"              "101 - 500"      "2 - 25"         "10,001 or more" "2,501 - 10,000"
[7] "1,001 - 2,500"  "501- 1,000"    
# Salary against years of experience for employed respondents: one panel per
# company age, points coloured by company size.
phackers %>%
  filter(employment_f == "Employed") %>%
  ggplot(aes(y = sal_num, x = exp_num, color = company_size_f)) +
  geom_jitter(alpha = 0.5) +
  facet_wrap(~company_age_f) +
  ylab("Salary") +
  xlab("Years of experience") +
  scale_color_discrete(name = "Company Size") +
  labs(title = "Employee Salaries", subtitle = "by Company Size and Age")