library(tidyverse) # data wrangling
library(gapminder) # dataset
Take a look at the dataset. A tibble is a type of data frame that works better with the tidyverse. To see the full dataset, use View().
gapminder # print the dataset; a tibble shows only its first 10 rows
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
filter()
# Keep only the observations that belong to one specific country.
gapminder %>%
  filter(country == "India")
## # A tibble: 12 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 India Asia 1952 37.4 372000000 547.
## 2 India Asia 1957 40.2 409000000 590.
## 3 India Asia 1962 43.6 454000000 658.
## 4 India Asia 1967 47.2 506000000 701.
## 5 India Asia 1972 50.7 567000000 724.
## 6 India Asia 1977 54.2 634000000 813.
## 7 India Asia 1982 56.6 708000000 856.
## 8 India Asia 1987 58.6 788000000 977.
## 9 India Asia 1992 60.2 872000000 1164.
## 10 India Asia 1997 61.8 959000000 1459.
## 11 India Asia 2002 62.9 1034172547 1747.
## 12 India Asia 2007 64.7 1110396331 2452.
# Comma-separated conditions must all hold: rows for India before 1975.
gapminder %>%
  filter(
    country == "India",
    year < 1975
  )
## # A tibble: 5 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 India Asia 1952 37.4 372000000 547.
## 2 India Asia 1957 40.2 409000000 590.
## 3 India Asia 1962 43.6 454000000 658.
## 4 India Asia 1967 47.2 506000000 701.
## 5 India Asia 1972 50.7 567000000 724.
# Use `|` for an "or" condition: rows where the country is India or Nepal.
gapminder %>%
  filter(country == "India" | country == "Nepal")
## # A tibble: 24 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 India Asia 1952 37.4 372000000 547.
## 2 India Asia 1957 40.2 409000000 590.
## 3 India Asia 1962 43.6 454000000 658.
## 4 India Asia 1967 47.2 506000000 701.
## 5 India Asia 1972 50.7 567000000 724.
## 6 India Asia 1977 54.2 634000000 813.
## 7 India Asia 1982 56.6 708000000 856.
## 8 India Asia 1987 58.6 788000000 977.
## 9 India Asia 1992 60.2 872000000 1164.
## 10 India Asia 1997 61.8 959000000 1459.
## # … with 14 more rows
# Use %in% to match a column against several values at once.
gapminder %>%
  filter(year %in% c(1952, 1962, 1972))
## # A tibble: 426 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1962 32.0 10267083 853.
## 3 Afghanistan Asia 1972 36.1 13079460 740.
## 4 Albania Europe 1952 55.2 1282697 1601.
## 5 Albania Europe 1962 64.8 1728137 2313.
## 6 Albania Europe 1972 67.7 2263554 3313.
## 7 Algeria Africa 1952 43.1 9279525 2449.
## 8 Algeria Africa 1962 48.3 11000948 2551.
## 9 Algeria Africa 1972 54.5 14760787 4183.
## 10 Angola Africa 1952 30.0 4232095 3521.
## # … with 416 more rows
arrange()
# arrange() sorts rows in ascending order unless told otherwise.
gapminder %>%
  filter(year %in% c(1952, 1962, 1972)) %>%
  arrange(year)
## # A tibble: 426 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Albania Europe 1952 55.2 1282697 1601.
## 3 Algeria Africa 1952 43.1 9279525 2449.
## 4 Angola Africa 1952 30.0 4232095 3521.
## 5 Argentina Americas 1952 62.5 17876956 5911.
## 6 Australia Oceania 1952 69.1 8691212 10040.
## 7 Austria Europe 1952 66.8 6927772 6137.
## 8 Bahrain Asia 1952 50.9 120447 9867.
## 9 Bangladesh Asia 1952 37.5 46886859 684.
## 10 Belgium Europe 1952 68 8730405 8343.
## # … with 416 more rows
# Wrap the column in desc() to sort in descending order instead.
gapminder %>%
  filter(year %in% c(1952, 1962, 1972)) %>%
  arrange(desc(year))
## # A tibble: 426 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1972 36.1 13079460 740.
## 2 Albania Europe 1972 67.7 2263554 3313.
## 3 Algeria Africa 1972 54.5 14760787 4183.
## 4 Angola Africa 1972 37.9 5894858 5473.
## 5 Argentina Americas 1972 67.1 24779799 9443.
## 6 Australia Oceania 1972 71.9 13177000 16789.
## 7 Austria Europe 1972 70.6 7544201 16662.
## 8 Bahrain Asia 1972 63.3 230800 18269.
## 9 Bangladesh Asia 1972 45.3 70759295 630.
## 10 Belgium Europe 1972 71.4 9709100 16672.
## # … with 416 more rows
select()
# Keep only the variables of interest.
gapminder %>%
  select(country, year, pop)
## # A tibble: 1,704 × 3
## country year pop
## <fct> <int> <int>
## 1 Afghanistan 1952 8425333
## 2 Afghanistan 1957 9240934
## 3 Afghanistan 1962 10267083
## 4 Afghanistan 1967 11537966
## 5 Afghanistan 1972 13079460
## 6 Afghanistan 1977 14880372
## 7 Afghanistan 1982 12881816
## 8 Afghanistan 1987 13867957
## 9 Afghanistan 1992 16317921
## 10 Afghanistan 1997 22227415
## # … with 1,694 more rows
# A leading minus drops a variable and keeps all the others.
gapminder %>%
  select(-pop)
## # A tibble: 1,704 × 5
## country continent year lifeExp gdpPercap
## <fct> <fct> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 779.
## 2 Afghanistan Asia 1957 30.3 821.
## 3 Afghanistan Asia 1962 32.0 853.
## 4 Afghanistan Asia 1967 34.0 836.
## 5 Afghanistan Asia 1972 36.1 740.
## 6 Afghanistan Asia 1977 38.4 786.
## 7 Afghanistan Asia 1982 39.9 978.
## 8 Afghanistan Asia 1987 40.8 852.
## 9 Afghanistan Asia 1992 41.7 649.
## 10 Afghanistan Asia 1997 41.8 635.
## # … with 1,694 more rows
mutate()
# Add a new variable: population in millions, rounded to one decimal place.
gapminder %>%
  select(country, pop) %>%
  mutate(pop_mil = round(pop / 1e6, 1))
## # A tibble: 1,704 × 3
## country pop pop_mil
## <fct> <int> <dbl>
## 1 Afghanistan 8425333 8.4
## 2 Afghanistan 9240934 9.2
## 3 Afghanistan 10267083 10.3
## 4 Afghanistan 11537966 11.5
## 5 Afghanistan 13079460 13.1
## 6 Afghanistan 14880372 14.9
## 7 Afghanistan 12881816 12.9
## 8 Afghanistan 13867957 13.9
## 9 Afghanistan 16317921 16.3
## 10 Afghanistan 22227415 22.2
## # … with 1,694 more rows
# Rank every observation by population (the smallest population gets rank 1),
# then sort the rows by that rank in ascending order.
gapminder %>%
  select(country, year, pop) %>%
  mutate(pop_rank = min_rank(pop)) %>%
  arrange(pop_rank)
## # A tibble: 1,704 × 4
## country year pop pop_rank
## <fct> <int> <int> <int>
## 1 Sao Tome and Principe 1952 60011 1
## 2 Sao Tome and Principe 1957 61325 2
## 3 Djibouti 1952 63149 3
## 4 Sao Tome and Principe 1962 65345 4
## 5 Sao Tome and Principe 1967 70787 5
## 6 Djibouti 1957 71851 6
## 7 Sao Tome and Principe 1972 76595 7
## 8 Sao Tome and Principe 1977 86796 8
## 9 Djibouti 1962 89898 9
## 10 Sao Tome and Principe 1982 98593 10
## # … with 1,694 more rows
# transmute() works like mutate() but keeps only the newly created variable.
gapminder %>%
  select(country, pop) %>%
  transmute(pop_mil = round(pop / 1e6, 1))
## # A tibble: 1,704 × 1
## pop_mil
## <dbl>
## 1 8.4
## 2 9.2
## 3 10.3
## 4 11.5
## 5 13.1
## 6 14.9
## 7 12.9
## 8 13.9
## 9 16.3
## 10 22.2
## # … with 1,694 more rows
# Recode each continent as a numeric code. case_when() checks the conditions
# in order; the final `TRUE ~ 5` is the fallback used when none of the
# conditions above it are true.
gapminder %>%
  select(country, continent, year, pop) %>%
  mutate(
    cont_code = case_when(
      continent == "Africa" ~ 1,
      continent == "Americas" ~ 2,
      continent == "Asia" ~ 3,
      continent == "Europe" ~ 4,
      TRUE ~ 5
    )
  )
## # A tibble: 1,704 × 5
## country continent year pop cont_code
## <fct> <fct> <int> <int> <dbl>
## 1 Afghanistan Asia 1952 8425333 3
## 2 Afghanistan Asia 1957 9240934 3
## 3 Afghanistan Asia 1962 10267083 3
## 4 Afghanistan Asia 1967 11537966 3
## 5 Afghanistan Asia 1972 13079460 3
## 6 Afghanistan Asia 1977 14880372 3
## 7 Afghanistan Asia 1982 12881816 3
## 8 Afghanistan Asia 1987 13867957 3
## 9 Afghanistan Asia 1992 16317921 3
## 10 Afghanistan Asia 1997 22227415 3
## # … with 1,694 more rows
# Collapse the entire dataset into a single row: the overall mean life
# expectancy.
gapminder %>%
  summarise(mean_lifeExp = mean(lifeExp))
## # A tibble: 1 × 1
## mean_lifeExp
## <dbl>
## 1 59.5
# Group by continent first so that summarise() returns one row per continent.
gapminder %>%
  group_by(continent) %>%
  summarise(mean_lifeExp = mean(lifeExp))
## # A tibble: 5 × 2
## continent mean_lifeExp
## <fct> <dbl>
## 1 Africa 48.9
## 2 Americas 64.7
## 3 Asia 60.1
## 4 Europe 71.9
## 5 Oceania 74.3
# Both snippets below give the same counts (only the name of the count column differs)
# count() tallies the number of rows per continent in a column named `n`.
gapminder %>%
  count(continent)
## # A tibble: 5 × 2
## continent n
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
# The same tally written out by hand: group by continent, then count the rows
# in each group with n().
gapminder %>%
  group_by(continent) %>%
  summarise(observations = n())
## # A tibble: 5 × 2
## continent observations
## <fct> <int>
## 1 Africa 624
## 2 Americas 300
## 3 Asia 396
## 4 Europe 360
## 5 Oceania 24
# Combine an "or" condition on country with a set of years: rows for India or
# Nepal in 1997, 2002 and 2007.
gapminder %>%
  filter(
    country == "India" | country == "Nepal",
    year %in% c(1997, 2002, 2007)
  )
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 India Asia 1997 61.8 959000000 1459.
## 2 India Asia 2002 62.9 1034172547 1747.
## 3 India Asia 2007 64.7 1110396331 2452.
## 4 Nepal Asia 1997 59.4 23001113 1011.
## 5 Nepal Asia 2002 61.3 25873917 1057.
## 6 Nepal Asia 2007 63.8 28901790 1091.
# Mean life expectancy of each Asian country across all years in the data.
# The summary column is named explicitly: an unnamed summarise() expression
# yields a column literally called `mean(lifeExp)`, which has to be quoted
# with backticks whenever it is used in a later step.
gapminder %>%
  filter(continent == "Asia") %>%
  group_by(country) %>%
  summarise(mean_lifeExp = mean(lifeExp))
## # A tibble: 33 × 2
## country mean_lifeExp
## <fct> <dbl>
## 1 Afghanistan 37.5
## 2 Bahrain 65.6
## 3 Bangladesh 49.8
## 4 Cambodia 47.9
## 5 China 61.8
## 6 Hong Kong, China 73.5
## 7 India 53.2
## 8 Indonesia 54.3
## 9 Iran 58.6
## 10 Iraq 56.6
## # … with 23 more rows
# The six Asian countries with the highest GDP per capita in 2007: sort in
# ascending order and keep the last six rows with tail().
gapminder %>%
  filter(
    continent == "Asia",
    year == 2007
  ) %>%
  group_by(country) %>%
  summarise(avg_gdp = mean(gdpPercap)) %>%
  arrange(avg_gdp) %>%
  tail()
## # A tibble: 6 × 2
## country avg_gdp
## <fct> <dbl>
## 1 Taiwan 28718.
## 2 Bahrain 29796.
## 3 Japan 31656.
## 4 Hong Kong, China 39725.
## 5 Singapore 47143.
## 6 Kuwait 47307.