4 Basic Data Cleaning and Preparation Tasks
4.1 Rearranging columns
# Displays the "pt_copy" dataset in the console
print(pt_copy)
## # A tibble: 85 × 75
## oecd country pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha legor_uk legor_so
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 Argenti… 0 0 213 0 183 0 0.268 0 0 0
## 2 1 Austral… 1 1 193 1 98 0.608 0 0 1 0
## 3 1 Austria 0 0 122 0 250 0 0 0 0 0
## 4 0 Bahamas 1 1 313 1 26 0.896 0 0 1 0
## 5 0 Banglad… 1 1 513 0 28 0 0 0.888 1 0
## 6 0 Barbados 1 1 316 1 33 0.868 0 0 1 0
## 7 0 Belarus 1 1 913 0 8 0 0 0.968 0 1
## 8 1 Belgium 0 0 124 0 169 0 0 0.324 0 0
## 9 0 Belize 1 1 339 1 18 0.928 0 0 1 0
## 10 0 Bolivia 0.116 0.116 218 0 174 0 0.304 0 0 0
## # … with 75 more rows, and 63 more variables: legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## # logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## # frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## # rgdph <dbl>, trade <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>,
## # spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>,
## # semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, dem_age <dbl>, lat01 <dbl>, …
# Brings the "country" column to the front of the dataset; the result is
# assigned back to "pt_copy", overwriting the existing version
pt_copy <- pt_copy %>% relocate(country)
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
## country oecd pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha legor_uk legor_so
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argenti… 0 0 0 213 0 183 0 0.268 0 0 0
## 2 Austral… 1 1 1 193 1 98 0.608 0 0 1 0
## 3 Austria 1 0 0 122 0 250 0 0 0 0 0
## 4 Bahamas 0 1 1 313 1 26 0.896 0 0 1 0
## 5 Banglad… 0 1 1 513 0 28 0 0 0.888 1 0
## 6 Barbados 0 1 1 316 1 33 0.868 0 0 1 0
## 7 Belarus 0 1 1 913 0 8 0 0 0.968 0 1
## 8 Belgium 1 0 0 124 0 169 0 0 0.324 0 0
## 9 Belize 0 1 1 339 1 18 0.928 0 0 1 0
## 10 Bolivia 0 0.116 0.116 218 0 174 0 0.304 0 0 0
## # … with 75 more rows, and 63 more variables: legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## # logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## # frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## # rgdph <dbl>, trade <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>,
## # spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>,
## # semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, dem_age <dbl>, lat01 <dbl>, …
# Brings the "country", "list", "trade", and "oecd" columns (in that order) to
# the front of the dataset
pt_copy <- pt_copy %>% relocate(country, list, trade, oecd)
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
## country list trade oecd pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argentina 257. 18.4 0 0 0 213 0 183 0 0.268 0
## 2 Australia 0 38.8 1 1 1 193 1 98 0.608 0 0
## 3 Austria 183 78.3 1 0 0 122 0 250 0 0 0
## 4 Bahamas 0 102. 0 1 1 313 1 26 0.896 0 0
## 5 Bangladesh 0 25.4 0 1 1 513 0 28 0 0 0.888
## 6 Barbados 0 116. 0 1 1 316 1 33 0.868 0 0
## 7 Belarus 0 117. 0 1 1 913 0 8 0 0 0.968
## 8 Belgium 184. 132. 1 0 0 124 0 169 0 0 0.324
## 9 Belize 0 113. 0 1 1 339 1 18 0.928 0 0
## 10 Bolivia 115. 48.9 0 0.116 0.116 218 0 174 0 0.304 0
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## # legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## # eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## # pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …
4.2 Renaming variables
# Renames a variable (renames "list" to "party_list"); rename() takes
# new_name = old_name
pt_copy <- pt_copy %>% rename(party_list = list)
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
## country party_list trade oecd pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argentina 257. 18.4 0 0 0 213 0 183 0 0.268 0
## 2 Australia 0 38.8 1 1 1 193 1 98 0.608 0 0
## 3 Austria 183 78.3 1 0 0 122 0 250 0 0 0
## 4 Bahamas 0 102. 0 1 1 313 1 26 0.896 0 0
## 5 Banglade… 0 25.4 0 1 1 513 0 28 0 0 0.888
## 6 Barbados 0 116. 0 1 1 316 1 33 0.868 0 0
## 7 Belarus 0 117. 0 1 1 913 0 8 0 0 0.968
## 8 Belgium 184. 132. 1 0 0 124 0 169 0 0 0.324
## 9 Belize 0 113. 0 1 1 339 1 18 0.928 0 0
## 10 Bolivia 115. 48.9 0 0.116 0.116 218 0 174 0 0.304 0
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## # legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## # eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## # pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …
4.3 Sorting a dataset in ascending or descending order with respect to a variable
# Sorts in ascending (low to high) order with respect to the "trade" variable
pt_copy <- pt_copy %>% arrange(trade)
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
## country party_list trade oecd pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Brazil 510. 17.6 0 0 1 223 0 177 0 0.292 0
## 2 Argentina 257. 18.4 0 0 0 213 0 183 0 0.268 0
## 3 Japan 67.7 18.8 1 0.867 0.867 158 0 250 0 0 0
## 4 India 0 21.9 0 1 1 534 1 52 0.792 0 0
## 5 USA 0 23.0 1 1 1 111 0 250 0 0 0
## 6 Banglade… 0 25.4 0 1 1 513 0 28 0 0 0.888
## 7 Peru 153. 25.9 0 0 0 293 0 178 0 0.288 0
## 8 Uganda 0 30.8 0 1 1 746 1 37 0.852 0 0
## 9 Colombia 157. 34.8 0 0 0 233 0 189 0 0.244 0
## 10 Pakistan 0 38.7 0 1 1 564 1 52 0.792 0 0
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## # legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## # eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## # pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …
# Sorts in descending (high to low) order with respect to the "trade" variable
pt_copy <- pt_copy %>% arrange(desc(trade))
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
## country party_list trade oecd pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Singapore 0 343. 0 1 1 576 1 34 0.864 0 0.864
## 2 Malta 65.9 190. 0 0 1 181 1 35 0.860 0 0
## 3 Luxembou… 60 189. 1 0 1 137 0 160 0 0 0.360
## 4 Malaysia 0 176. 0 1 1 548 1 42 0.832 0 0
## 5 Estonia 101 154. 0 0 1 939 0 8 0 0 0.968
## 6 Belgium 184. 132. 1 0 0 124 0 169 0 0 0.324
## 7 Ireland 166 129. 1 0 1 178 1 78 0.688 0 0
## 8 Mauritius 0 128. 0 1 1 684 1 31 0.876 0 0
## 9 St. Vinc… 0 123. 0 1 1 364 1 20 0.920 0 0
## 10 Jamaica 0 122. 0 1 1 343 1 37 0.852 0 0
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## # legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## # eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## # pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …
4.4 Creating new variables based on existing variables
# Creates a new variable named "non_catholic_80" that is calculated by
# subtracting the Catholic share of the population in 1980 ("catho80") from
# 100, and relocates "country", "catho80", and the newly created
# "non_catholic_80" to the front of the dataset
pt_copy <- pt_copy %>%
  mutate(non_catholic_80 = 100 - catho80) %>%
  relocate(country, catho80, non_catholic_80)
# Prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 76
## country catho80 non_catholic_80 party_list trade oecd pind pindo ctrycd col_uk t_indep
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Singapore 4.70 95.3 0 343. 0 1 1 576 1 34
## 2 Malta 97.3 2.70 65.9 190. 0 0 1 181 1 35
## 3 Luxembourg 93 7 60 189. 1 0 1 137 0 160
## 4 Malaysia 2.80 97.2 0 176. 0 1 1 548 1 42
## 5 Estonia 2 98 101 154. 0 0 1 939 0 8
## 6 Belgium 90 10 184. 132. 1 0 0 124 0 169
## 7 Ireland 95.3 4.70 166 129. 1 0 1 178 1 78
## 8 Mauritius 31.2 68.8 0 128. 0 1 1 684 1 31
## 9 St. Vincen… 19.4 80.6 0 123. 0 1 1 364 1 20
## 10 Jamaica 9.60 90.4 0 122. 0 1 1 343 1 37
## # … with 75 more rows, and 65 more variables: col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## # legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## # yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## # gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>,
## # prop65 <dbl>, federal <dbl>, eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>,
## # seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, …
4.5 Selecting or deleting variables
4.5.1 Selecting variables
# Selects the "country", "cgexp", "cgrev", and "trade" variables from the
# "pt_copy" dataset; the result is only printed, not assigned, so "pt_copy"
# itself is left unchanged
pt_copy %>% select(country, cgexp, cgrev, trade)
## # A tibble: 85 × 4
## country cgexp cgrev trade
## <chr> <dbl> <dbl> <dbl>
## 1 Singapore 18.5 34.7 343.
## 2 Malta 41.0 35.0 190.
## 3 Luxembourg 40.2 45.5 189.
## 4 Malaysia 24.5 26.8 176.
## 5 Estonia 30.0 31.1 154.
## 6 Belgium 47.9 43.7 132.
## 7 Ireland 38.1 34.8 129.
## 8 Mauritius 22.5 21.6 128.
## 9 St. Vincent&G 34.8 28.7 123.
## 10 Jamaica NA NA 122.
## # … with 75 more rows
# Selects the "country", "cgexp", "cgrev", and "trade" variables from the
# "pt_copy" dataset and assigns the selection to a new object named
# "pt_copy_selection"
pt_copy_selection <- pt_copy %>% select(country, cgexp, cgrev, trade)
# Prints contents of "pt_copy_selection"
pt_copy_selection
## # A tibble: 85 × 4
## country cgexp cgrev trade
## <chr> <dbl> <dbl> <dbl>
## 1 Singapore 18.5 34.7 343.
## 2 Malta 41.0 35.0 190.
## 3 Luxembourg 40.2 45.5 189.
## 4 Malaysia 24.5 26.8 176.
## 5 Estonia 30.0 31.1 154.
## 6 Belgium 47.9 43.7 132.
## 7 Ireland 38.1 34.8 129.
## 8 Mauritius 22.5 21.6 128.
## 9 St. Vincent&G 34.8 28.7 123.
## 10 Jamaica NA NA 122.
## # … with 75 more rows
4.5.2 Deleting variables
# Deletes the "cgrev" variable from the "pt_copy_selection" dataset; the result
# is only printed, not assigned, so "pt_copy_selection" is left unchanged
pt_copy_selection %>% select(-cgrev)
## # A tibble: 85 × 3
## country cgexp trade
## <chr> <dbl> <dbl>
## 1 Singapore 18.5 343.
## 2 Malta 41.0 190.
## 3 Luxembourg 40.2 189.
## 4 Malaysia 24.5 176.
## 5 Estonia 30.0 154.
## 6 Belgium 47.9 132.
## 7 Ireland 38.1 129.
## 8 Mauritius 22.5 128.
## 9 St. Vincent&G 34.8 123.
## 10 Jamaica NA 122.
## # … with 75 more rows
# Deletes the "cgrev" AND "cgexp" variables from the "pt_copy_selection"
# dataset; the result is only printed, not assigned
pt_copy_selection %>% select(-c(cgexp, cgrev))
## # A tibble: 85 × 2
## country trade
## <chr> <dbl>
## 1 Singapore 343.
## 2 Malta 190.
## 3 Luxembourg 189.
## 4 Malaysia 176.
## 5 Estonia 154.
## 6 Belgium 132.
## 7 Ireland 129.
## 8 Mauritius 128.
## 9 St. Vincent&G 123.
## 10 Jamaica 122.
## # … with 75 more rows
# Deletes the "cgrev" AND "cgexp" variables from the "pt_copy_selection"
# dataset and assigns the result to a new object named "pt_copy_trade"
pt_copy_trade <- pt_copy_selection %>% select(-c(cgexp, cgrev))
# Prints contents of "pt_copy_trade"
pt_copy_trade
## # A tibble: 85 × 2
## country trade
## <chr> <dbl>
## 1 Singapore 343.
## 2 Malta 190.
## 3 Luxembourg 189.
## 4 Malaysia 176.
## 5 Estonia 154.
## 6 Belgium 132.
## 7 Ireland 129.
## 8 Mauritius 128.
## 9 St. Vincent&G 123.
## 10 Jamaica 122.
## # … with 75 more rows
# Deletes the "cgrev" AND "cgexp" variables from the "pt_copy_selection"
# dataset and assigns the result to "pt_copy_selection", thereby overwriting
# the existing version of "pt_copy_selection" with a new version that reflects
# these deletions
pt_copy_selection <- pt_copy_selection %>% select(-c(cgexp, cgrev))
# Prints updated contents of "pt_copy_selection"
pt_copy_selection
## # A tibble: 85 × 2
## country trade
## <chr> <dbl>
## 1 Singapore 343.
## 2 Malta 190.
## 3 Luxembourg 189.
## 4 Malaysia 176.
## 5 Estonia 154.
## 6 Belgium 132.
## 7 Ireland 129.
## 8 Mauritius 128.
## 9 St. Vincent&G 123.
## 10 Jamaica 122.
## # … with 75 more rows
4.6 Recoding variables
4.6.1 Creating Dummy Variables from Continuous Numeric Variables
# Creates a new dummy variable named "trade_open" based on the existing "trade"
# variable (it takes on a value of 1 if "trade" is greater than or equal to 77,
# and 0 otherwise) and then moves the newly created variable to the front of
# the dataset along with "country" and "trade"; all changes are assigned to
# "pt_copy", thereby overwriting the existing version of "pt_copy"
pt_copy <- pt_copy %>%
  mutate(trade_open = ifelse(trade >= 77, 1, 0)) %>%
  relocate(country, trade_open, trade)
# Prints updated contents of "pt_copy"; note the newly created dummy variable
pt_copy
## # A tibble: 85 × 77
## country trade_open trade catho80 non_catholic_80 party_list oecd pind pindo ctrycd col_uk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Singapo… 1 343. 4.70 95.3 0 0 1 1 576 1
## 2 Malta 1 190. 97.3 2.70 65.9 0 0 1 181 1
## 3 Luxembo… 1 189. 93 7 60 1 0 1 137 0
## 4 Malaysia 1 176. 2.80 97.2 0 0 1 1 548 1
## 5 Estonia 1 154. 2 98 101 0 0 1 939 0
## 6 Belgium 1 132. 90 10 184. 1 0 0 124 0
## 7 Ireland 1 129. 95.3 4.70 166 1 0 1 178 1
## 8 Mauriti… 1 128. 31.2 68.8 0 0 1 1 684 1
## 9 St. Vin… 1 123. 19.4 80.6 0 0 1 1 364 1
## 10 Jamaica 1 122. 9.60 90.4 0 0 1 1 343 1
## # … with 75 more rows, and 66 more variables: t_indep <dbl>, col_uka <dbl>, col_espa <dbl>,
## # col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>,
## # legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## # logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## # frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## # rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>, spropn <dbl>,
## # yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>, semi <dbl>, …
4.6.2 Creating categorical variables from continuous numeric variables
# Creates a new variable in the "pt_copy" dataset named "trade_level" (coded as
# "Low_Trade" when the "trade" variable is greater than 15 and less than 50,
# as "Intermediate_Trade" when "trade" is greater than or equal to 50 and less
# than 100, and as "High_Trade" when "trade" is greater than or equal to 100),
# and then reorders the dataset such that "country", "trade_level", and "trade"
# are the first three variables in the dataset.
# Note: no fallback condition is supplied, so an observation matching none of
# the three conditions (e.g. "trade" of 15 or less) is coded as NA.
pt_copy <- pt_copy %>%
  mutate(
    trade_level = case_when(
      trade > 15 & trade < 50 ~ "Low_Trade",
      trade >= 50 & trade < 100 ~ "Intermediate_Trade",
      trade >= 100 ~ "High_Trade"
    )
  ) %>%
  relocate(country, trade_level, trade)
# Prints updated contents of "pt_copy"; note the newly created categorical
# variable
pt_copy
## # A tibble: 85 × 78
## country trade_level trade trade_open catho80 non_catholic_80 party_list oecd pind pindo
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Singapore High_Trade 343. 1 4.70 95.3 0 0 1 1
## 2 Malta High_Trade 190. 1 97.3 2.70 65.9 0 0 1
## 3 Luxembourg High_Trade 189. 1 93 7 60 1 0 1
## 4 Malaysia High_Trade 176. 1 2.80 97.2 0 0 1 1
## 5 Estonia High_Trade 154. 1 2 98 101 0 0 1
## 6 Belgium High_Trade 132. 1 90 10 184. 1 0 0
## 7 Ireland High_Trade 129. 1 95.3 4.70 166 1 0 1
## 8 Mauritius High_Trade 128. 1 31.2 68.8 0 0 1 1
## 9 St. Vince… High_Trade 123. 1 19.4 80.6 0 0 1 1
## 10 Jamaica High_Trade 122. 1 9.60 90.4 0 0 1 1
## # … with 75 more rows, and 68 more variables: ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>,
## # col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>,
## # legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## # eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, …
4.6.3 Creating dummy variables from categorical variables
# Creates dummy variables from the "trade_level" column (one 0/1 column per
# category, named "trade_level_<category>"), and relocates the new dummies to
# the front of the dataset, after "country" and "trade_level"
pt_copy <- pt_copy %>%
  dummy_cols("trade_level") %>%
  relocate(
    country,
    trade_level,
    trade_level_High_Trade,
    trade_level_Intermediate_Trade,
    trade_level_Low_Trade
  )
# Prints contents of "pt_copy"
pt_copy
## # A tibble: 85 × 81
## country trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade trade_open
## <chr> <chr> <int> <int> <int> <dbl> <dbl>
## 1 Singapore High_Trade 1 0 0 343. 1
## 2 Malta High_Trade 1 0 0 190. 1
## 3 Luxembourg High_Trade 1 0 0 189. 1
## 4 Malaysia High_Trade 1 0 0 176. 1
## 5 Estonia High_Trade 1 0 0 154. 1
## 6 Belgium High_Trade 1 0 0 132. 1
## 7 Ireland High_Trade 1 0 0 129. 1
## 8 Mauritius High_Trade 1 0 0 128. 1
## 9 St. Vincent… High_Trade 1 0 0 123. 1
## 10 Jamaica High_Trade 1 0 0 122. 1
## # … with 75 more rows, and 74 more variables: catho80 <dbl>, non_catholic_80 <dbl>,
## # party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## # t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## # legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## # avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, …
4.7 Subsetting rows of data based on criteria
4.7.1 The filter function
# Extracts OECD observations in "pt_copy" (rows where "oecd" equals 1), moves
# "country" and "oecd" to the front, and assigns the result to an object named
# "oecd_countries"
oecd_countries <- pt_copy %>%
  filter(oecd == 1) %>%
  relocate(country, oecd)
# Prints contents of "oecd_countries"
oecd_countries
## # A tibble: 25 × 81
## country oecd trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
## <chr> <dbl> <chr> <int> <int> <int> <dbl>
## 1 Luxembourg 1 High_Trade 1 0 0 189.
## 2 Belgium 1 High_Trade 1 0 0 132.
## 3 Ireland 1 High_Trade 1 0 0 129.
## 4 Netherlands 1 High_Trade 1 0 0 100.
## 5 Austria 1 Intermediate_Tra… 0 1 0 78.3
## 6 Norway 1 Intermediate_Tra… 0 1 0 72.2
## 7 Switzerland 1 Intermediate_Tra… 0 1 0 68.9
## 8 Portugal 1 Intermediate_Tra… 0 1 0 68.5
## 9 Sweden 1 Intermediate_Tra… 0 1 0 68.1
## 10 Iceland 1 Intermediate_Tra… 0 1 0 66.9
## # … with 15 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## # non_catholic_80 <dbl>, party_list <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>,
## # col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## # legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## # yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## # gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …
# Extracts observations for which "cgrev" (central government revenue as % of
# gdp) is greater than 40, moves "country" and "cgrev" to the front, and
# assigns the result to an object named "high_revenues"
high_revenues <- pt_copy %>%
  filter(cgrev > 40) %>%
  relocate(country, cgrev)
# Prints contents of "high_revenues"
high_revenues
## # A tibble: 10 × 81
## country cgrev trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
## <chr> <dbl> <chr> <int> <int> <int> <dbl>
## 1 Luxembourg 45.5 High_Trade 1 0 0 189.
## 2 Belgium 43.7 High_Trade 1 0 0 132.
## 3 Netherlands 47.6 High_Trade 1 0 0 100.
## 4 Botswana 50.8 Intermediate_Tra… 0 1 0 87.5
## 5 Hungary 45.6 Intermediate_Tra… 0 1 0 73.7
## 6 Norway 41.1 Intermediate_Tra… 0 1 0 72.2
## 7 Sweden 40.8 Intermediate_Tra… 0 1 0 68.1
## 8 Poland 40.3 Low_Trade 0 0 1 48.3
## 9 France 40.9 Low_Trade 0 0 1 44.9
## 10 Italy 41.2 Low_Trade 0 0 1 44.3
## # … with 74 more variables: trade_open <dbl>, catho80 <dbl>, non_catholic_80 <dbl>,
## # party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## # t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## # legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## # avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>, …
# Extracts observations for which the "catho80" variable is less than or equal
# to 50, moves "country" and "catho80" to the front, and assigns the result to
# an object named "minority_catholic"
minority_catholic <- pt_copy %>%
  filter(catho80 <= 50) %>%
  relocate(country, catho80)
# Prints contents of "minority_catholic"
minority_catholic
## # A tibble: 53 × 81
## country catho80 trade_level trade_level_High_… trade_level_Int… trade_level_Low… trade
## <chr> <dbl> <chr> <int> <int> <int> <dbl>
## 1 Singapore 4.70 High_Trade 1 0 0 343.
## 2 Malaysia 2.80 High_Trade 1 0 0 176.
## 3 Estonia 2 High_Trade 1 0 0 154.
## 4 Mauritius 31.2 High_Trade 1 0 0 128.
## 5 St. Vincent&G 19.4 High_Trade 1 0 0 123.
## 6 Jamaica 9.60 High_Trade 1 0 0 122.
## 7 Gambia 1.90 High_Trade 1 0 0 122.
## 8 Fiji 9 High_Trade 1 0 0 119.
## 9 Belarus 14 High_Trade 1 0 0 117.
## 10 Barbados 5.90 High_Trade 1 0 0 116.
## # … with 43 more rows, and 74 more variables: trade_open <dbl>, non_catholic_80 <dbl>,
## # party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## # t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## # legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## # avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, …
Using the & operator
# Extracts federal OECD countries (where oecd == 1 AND federal == 1), moves
# "country", "oecd", and "federal" to the front, and assigns the result to a
# new object named "oecd_federal_countries"
oecd_federal_countries <- pt_copy %>%
  filter(oecd == 1 & federal == 1) %>%
  relocate(country, oecd, federal)
# Prints contents of "oecd_federal_countries"
oecd_federal_countries
## # A tibble: 7 × 81
## country oecd federal trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
## <chr> <dbl> <dbl> <chr> <int> <int> <int> <dbl>
## 1 Austria 1 1 Intermedia… 0 1 0 78.3
## 2 Switzerla… 1 1 Intermedia… 0 1 0 68.9
## 3 Canada 1 1 Intermedia… 0 1 0 66.5
## 4 Germany 1 1 Low_Trade 0 0 1 48.8
## 5 Mexico 1 1 Low_Trade 0 0 1 47.5
## 6 Australia 1 1 Low_Trade 0 0 1 38.8
## 7 USA 1 1 Low_Trade 0 0 1 23.0
## # … with 73 more variables: trade_open <dbl>, catho80 <dbl>, non_catholic_80 <dbl>,
## # party_list <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>,
## # col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>,
## # legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>,
## # govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## # engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## # cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, eduger <dbl>, …
Using the | operator
# Extracts observations that are in Africa ("africa") OR in Asia/Europe
# ("asiae"), moves "continent" to the front, and assigns the result to an
# object named "asia_europe_africa"
asia_europe_africa <- pt_copy %>%
  filter(continent == "africa" | continent == "asiae") %>%
  relocate(continent)
# Prints contents of "asia_europe_africa"
asia_europe_africa
## # A tibble: 24 × 81
## continent country trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
## <chr> <chr> <chr> <int> <int> <int> <dbl>
## 1 asiae Singapore High_Trade 1 0 0 343.
## 2 asiae Malaysia High_Trade 1 0 0 176.
## 3 africa Mauritius High_Trade 1 0 0 128.
## 4 africa Gambia High_Trade 1 0 0 122.
## 5 asiae Fiji High_Trade 1 0 0 119.
## 6 africa Namibia High_Trade 1 0 0 115.
## 7 asiae Papua N. Guin High_Trade 1 0 0 102.
## 8 asiae Taiwan Intermedia… 0 1 0 90.7
## 9 africa Botswana Intermedia… 0 1 0 87.5
## 10 asiae Thailand Intermedia… 0 1 0 84.9
## # … with 14 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## # non_catholic_80 <dbl>, party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>,
## # ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## # legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## # yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## # gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …
Filtering for observations that do NOT meet a condition:
# Extracts all non-Africa observations (rows where "continent" is not
# "africa"), moves "continent" to the front, and assigns the result to an
# object named "pt_copy_sans_africa"
pt_copy_sans_africa <- pt_copy %>%
  filter(continent != "africa") %>%
  relocate(continent)
# Prints contents of "pt_copy_sans_africa"
pt_copy_sans_africa
## # A tibble: 74 × 81
## continent country trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
## <chr> <chr> <chr> <int> <int> <int> <dbl>
## 1 asiae Singapore High_Trade 1 0 0 343.
## 2 other Malta High_Trade 1 0 0 190.
## 3 other Luxembourg High_Trade 1 0 0 189.
## 4 asiae Malaysia High_Trade 1 0 0 176.
## 5 other Estonia High_Trade 1 0 0 154.
## 6 other Belgium High_Trade 1 0 0 132.
## 7 other Ireland High_Trade 1 0 0 129.
## 8 laam St. Vincent&G High_Trade 1 0 0 123.
## 9 laam Jamaica High_Trade 1 0 0 122.
## 10 other Slovak Repub… High_Trade 1 0 0 119.
## # … with 64 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## # non_catholic_80 <dbl>, party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>,
## # ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## # legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## # prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## # yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## # gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …