4 Basic Data Cleaning and Preparation Tasks

4.1 Rearranging columns

# Prints the contents of "pt_copy"; the console shows only the first 10 rows
# and as many columns as fit, and lists the remaining variables underneath
pt_copy
## # A tibble: 85 × 75
##     oecd country   pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha legor_uk legor_so
##    <dbl> <chr>    <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1     0 Argenti… 0     0        213      0     183   0        0.268    0            0        0
##  2     1 Austral… 1     1        193      1      98   0.608    0        0            1        0
##  3     1 Austria  0     0        122      0     250   0        0        0            0        0
##  4     0 Bahamas  1     1        313      1      26   0.896    0        0            1        0
##  5     0 Banglad… 1     1        513      0      28   0        0        0.888        1        0
##  6     0 Barbados 1     1        316      1      33   0.868    0        0            1        0
##  7     0 Belarus  1     1        913      0       8   0        0        0.968        0        1
##  8     1 Belgium  0     0        124      0     169   0        0        0.324        0        0
##  9     0 Belize   1     1        339      1      18   0.928    0        0            1        0
## 10     0 Bolivia  0.116 0.116    218      0     174   0        0.304    0            0        0
## # … with 75 more rows, and 63 more variables: legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## #   logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## #   frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## #   rgdph <dbl>, trade <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>,
## #   spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>,
## #   semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, dem_age <dbl>, lat01 <dbl>, …
# Moves the "country" column to the first position of "pt_copy"; the remaining
# columns keep their existing order
pt_copy <- pt_copy %>%
  relocate(country)
# prints updated contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
##    country   oecd  pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha legor_uk legor_so
##    <chr>    <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 Argenti…     0 0     0        213      0     183   0        0.268    0            0        0
##  2 Austral…     1 1     1        193      1      98   0.608    0        0            1        0
##  3 Austria      1 0     0        122      0     250   0        0        0            0        0
##  4 Bahamas      0 1     1        313      1      26   0.896    0        0            1        0
##  5 Banglad…     0 1     1        513      0      28   0        0        0.888        1        0
##  6 Barbados     0 1     1        316      1      33   0.868    0        0            1        0
##  7 Belarus      0 1     1        913      0       8   0        0        0.968        0        1
##  8 Belgium      1 0     0        124      0     169   0        0        0.324        0        0
##  9 Belize       0 1     1        339      1      18   0.928    0        0            1        0
## 10 Bolivia      0 0.116 0.116    218      0     174   0        0.304    0            0        0
## # … with 75 more rows, and 63 more variables: legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## #   logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## #   frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## #   rgdph <dbl>, trade <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>,
## #   spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>,
## #   semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, dem_age <dbl>, lat01 <dbl>, …
# Moves the "country", "list", "trade", and "oecd" columns (in that order) to
# the front of the dataset
pt_copy <- pt_copy %>%
  relocate(country, list, trade, oecd)

# prints the reordered contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
##    country     list trade  oecd  pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
##    <chr>      <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 Argentina   257.  18.4     0 0     0        213      0     183   0        0.268    0    
##  2 Australia     0   38.8     1 1     1        193      1      98   0.608    0        0    
##  3 Austria     183   78.3     1 0     0        122      0     250   0        0        0    
##  4 Bahamas       0  102.      0 1     1        313      1      26   0.896    0        0    
##  5 Bangladesh    0   25.4     0 1     1        513      0      28   0        0        0.888
##  6 Barbados      0  116.      0 1     1        316      1      33   0.868    0        0    
##  7 Belarus       0  117.      0 1     1        913      0       8   0        0        0.968
##  8 Belgium     184. 132.      1 0     0        124      0     169   0        0        0.324
##  9 Belize        0  113.      0 1     1        339      1      18   0.928    0        0    
## 10 Bolivia     115.  48.9     0 0.116 0.116    218      0     174   0        0.304    0    
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## #   legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## #   eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## #   pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …

4.2 Renaming variables

# Renames the "list" variable to "party_list"; the syntax inside rename() is
# new_name = old_name
pt_copy <- pt_copy %>%
  rename(party_list = list)

# prints the contents of "pt_copy" with the renamed column
pt_copy
## # A tibble: 85 × 75
##    country   party_list trade  oecd  pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
##    <chr>          <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 Argentina       257.  18.4     0 0     0        213      0     183   0        0.268    0    
##  2 Australia         0   38.8     1 1     1        193      1      98   0.608    0        0    
##  3 Austria         183   78.3     1 0     0        122      0     250   0        0        0    
##  4 Bahamas           0  102.      0 1     1        313      1      26   0.896    0        0    
##  5 Banglade…         0   25.4     0 1     1        513      0      28   0        0        0.888
##  6 Barbados          0  116.      0 1     1        316      1      33   0.868    0        0    
##  7 Belarus           0  117.      0 1     1        913      0       8   0        0        0.968
##  8 Belgium         184. 132.      1 0     0        124      0     169   0        0        0.324
##  9 Belize            0  113.      0 1     1        339      1      18   0.928    0        0    
## 10 Bolivia         115.  48.9     0 0.116 0.116    218      0     174   0        0.304    0    
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## #   legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## #   eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## #   pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …

4.3 Sorting a dataset in ascending or descending order with respect to a variable

# Sorts the rows of "pt_copy" in ascending (low to high) order of the "trade"
# variable and overwrites "pt_copy" with the sorted dataset
pt_copy <- pt_copy %>%
  arrange(trade)

# prints the sorted contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
##    country   party_list trade  oecd  pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
##    <chr>          <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 Brazil         510.   17.6     0 0     1        223      0     177   0        0.292    0    
##  2 Argentina      257.   18.4     0 0     0        213      0     183   0        0.268    0    
##  3 Japan           67.7  18.8     1 0.867 0.867    158      0     250   0        0        0    
##  4 India            0    21.9     0 1     1        534      1      52   0.792    0        0    
##  5 USA              0    23.0     1 1     1        111      0     250   0        0        0    
##  6 Banglade…        0    25.4     0 1     1        513      0      28   0        0        0.888
##  7 Peru           153.   25.9     0 0     0        293      0     178   0        0.288    0    
##  8 Uganda           0    30.8     0 1     1        746      1      37   0.852    0        0    
##  9 Colombia       157.   34.8     0 0     0        233      0     189   0        0.244    0    
## 10 Pakistan         0    38.7     0 1     1        564      1      52   0.792    0        0    
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## #   legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## #   eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## #   pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …
# Sorts the rows of "pt_copy" in descending (high to low) order of the "trade"
# variable by wrapping the variable in desc()
pt_copy <- pt_copy %>%
  arrange(desc(trade))

# prints the sorted contents of "pt_copy"
pt_copy
## # A tibble: 85 × 75
##    country   party_list trade  oecd  pind pindo ctrycd col_uk t_indep col_uka col_espa col_otha
##    <chr>          <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>   <dbl>    <dbl>    <dbl>
##  1 Singapore        0    343.     0     1     1    576      1      34   0.864        0    0.864
##  2 Malta           65.9  190.     0     0     1    181      1      35   0.860        0    0    
##  3 Luxembou…       60    189.     1     0     1    137      0     160   0            0    0.360
##  4 Malaysia         0    176.     0     1     1    548      1      42   0.832        0    0    
##  5 Estonia        101    154.     0     0     1    939      0       8   0            0    0.968
##  6 Belgium        184.   132.     1     0     0    124      0     169   0            0    0.324
##  7 Ireland        166    129.     1     0     1    178      1      78   0.688        0    0    
##  8 Mauritius        0    128.     0     1     1    684      1      31   0.876        0    0    
##  9 St. Vinc…        0    123.     0     1     1    364      1      20   0.920        0    0    
## 10 Jamaica          0    122.     0     1     1    343      1      37   0.852        0    0    
## # … with 75 more rows, and 63 more variables: legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>,
## #   legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, catho80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## #   eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>,
## #   pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, propres <dbl>, …

4.4 Creating new variables based on existing variables

# Creates a new variable named "non_catholic_80", calculated by subtracting the
# Catholic share of the population in 1980 ("catho80") from 100, and then moves
# "country", "catho80", and the newly created "non_catholic_80" to the front of
# the dataset
pt_copy <- pt_copy %>%
  mutate(non_catholic_80 = 100 - catho80) %>%
  relocate(country, catho80, non_catholic_80)
# prints updated contents of "pt_copy"; note the extra column (76 instead of 75)
pt_copy
## # A tibble: 85 × 76
##    country     catho80 non_catholic_80 party_list trade  oecd  pind pindo ctrycd col_uk t_indep
##    <chr>         <dbl>           <dbl>      <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>   <dbl>
##  1 Singapore      4.70           95.3         0    343.     0     1     1    576      1      34
##  2 Malta         97.3             2.70       65.9  190.     0     0     1    181      1      35
##  3 Luxembourg    93               7          60    189.     1     0     1    137      0     160
##  4 Malaysia       2.80           97.2         0    176.     0     1     1    548      1      42
##  5 Estonia        2              98         101    154.     0     0     1    939      0       8
##  6 Belgium       90              10         184.   132.     1     0     0    124      0     169
##  7 Ireland       95.3             4.70      166    129.     1     0     1    178      1      78
##  8 Mauritius     31.2            68.8         0    128.     0     1     1    684      1      31
##  9 St. Vincen…   19.4            80.6         0    123.     0     1     1    364      1      20
## 10 Jamaica        9.60           90.4         0    122.     0     1     1    343      1      37
## # … with 75 more rows, and 65 more variables: col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## #   legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## #   yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## #   gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>,
## #   prop65 <dbl>, federal <dbl>, eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>,
## #   seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>, semi <dbl>, majpar <dbl>, majpres <dbl>, …

4.5 Selecting or deleting variables

4.5.1 Selecting variables

# Selects the "country", "cgexp", "cgrev", and "trade" variables from "pt_copy"
# and prints the result; nothing is assigned, so "pt_copy" itself is unchanged
pt_copy %>%
  select(country, cgexp, cgrev, trade)
## # A tibble: 85 × 4
##    country       cgexp cgrev trade
##    <chr>         <dbl> <dbl> <dbl>
##  1 Singapore      18.5  34.7  343.
##  2 Malta          41.0  35.0  190.
##  3 Luxembourg     40.2  45.5  189.
##  4 Malaysia       24.5  26.8  176.
##  5 Estonia        30.0  31.1  154.
##  6 Belgium        47.9  43.7  132.
##  7 Ireland        38.1  34.8  129.
##  8 Mauritius      22.5  21.6  128.
##  9 St. Vincent&G  34.8  28.7  123.
## 10 Jamaica        NA    NA    122.
## # … with 75 more rows
# Makes the same four-variable selection ("country", "cgexp", "cgrev", "trade")
# from "pt_copy" and stores it in a new object named "pt_copy_selection"
pt_copy_selection <- pt_copy %>%
  select(country, cgexp, cgrev, trade)
# Prints contents of "pt_copy_selection"
pt_copy_selection
## # A tibble: 85 × 4
##    country       cgexp cgrev trade
##    <chr>         <dbl> <dbl> <dbl>
##  1 Singapore      18.5  34.7  343.
##  2 Malta          41.0  35.0  190.
##  3 Luxembourg     40.2  45.5  189.
##  4 Malaysia       24.5  26.8  176.
##  5 Estonia        30.0  31.1  154.
##  6 Belgium        47.9  43.7  132.
##  7 Ireland        38.1  34.8  129.
##  8 Mauritius      22.5  21.6  128.
##  9 St. Vincent&G  34.8  28.7  123.
## 10 Jamaica        NA    NA    122.
## # … with 75 more rows

4.5.2 Deleting variables

# Drops the "cgrev" variable from "pt_copy_selection" by prefixing it with a
# minus sign; the result is only printed, so "pt_copy_selection" still contains
# "cgrev" afterwards
pt_copy_selection %>%
  select(-cgrev)
## # A tibble: 85 × 3
##    country       cgexp trade
##    <chr>         <dbl> <dbl>
##  1 Singapore      18.5  343.
##  2 Malta          41.0  190.
##  3 Luxembourg     40.2  189.
##  4 Malaysia       24.5  176.
##  5 Estonia        30.0  154.
##  6 Belgium        47.9  132.
##  7 Ireland        38.1  129.
##  8 Mauritius      22.5  128.
##  9 St. Vincent&G  34.8  123.
## 10 Jamaica        NA    122.
## # … with 75 more rows
# Drops both the "cgexp" AND "cgrev" variables from "pt_copy_selection" by
# negating a vector of names; again the result is only printed, not stored
pt_copy_selection %>%
  select(-c(cgexp, cgrev))
## # A tibble: 85 × 2
##    country       trade
##    <chr>         <dbl>
##  1 Singapore      343.
##  2 Malta          190.
##  3 Luxembourg     189.
##  4 Malaysia       176.
##  5 Estonia        154.
##  6 Belgium        132.
##  7 Ireland        129.
##  8 Mauritius      128.
##  9 St. Vincent&G  123.
## 10 Jamaica        122.
## # … with 75 more rows
# Drops the "cgexp" AND "cgrev" variables from "pt_copy_selection" and assigns
# the result to a new object named "pt_copy_trade"
pt_copy_trade <- pt_copy_selection %>%
  select(-c(cgexp, cgrev))
# Prints contents of "pt_copy_trade"
pt_copy_trade
## # A tibble: 85 × 2
##    country       trade
##    <chr>         <dbl>
##  1 Singapore      343.
##  2 Malta          190.
##  3 Luxembourg     189.
##  4 Malaysia       176.
##  5 Estonia        154.
##  6 Belgium        132.
##  7 Ireland        129.
##  8 Mauritius      128.
##  9 St. Vincent&G  123.
## 10 Jamaica        122.
## # … with 75 more rows
# Drops the "cgexp" AND "cgrev" variables from "pt_copy_selection" and assigns
# the result back to "pt_copy_selection", thereby overwriting the existing
# version of the object with one that reflects these deletions
pt_copy_selection <- pt_copy_selection %>%
  select(-c(cgexp, cgrev))
# prints updated contents of "pt_copy_selection"
pt_copy_selection
## # A tibble: 85 × 2
##    country       trade
##    <chr>         <dbl>
##  1 Singapore      343.
##  2 Malta          190.
##  3 Luxembourg     189.
##  4 Malaysia       176.
##  5 Estonia        154.
##  6 Belgium        132.
##  7 Ireland        129.
##  8 Mauritius      128.
##  9 St. Vincent&G  123.
## 10 Jamaica        122.
## # … with 75 more rows

4.6 Recoding variables

4.6.1 Creating Dummy Variables from Continuous Numeric Variables

# Creates a new dummy variable named "trade_open" from the existing "trade"
# variable (1 if "trade" is greater than or equal to 77, and 0 otherwise), then
# moves "country", "trade_open", and "trade" to the front of the dataset; the
# result is assigned back to "pt_copy", overwriting the existing version

pt_copy <- pt_copy %>%
  mutate(trade_open = ifelse(trade >= 77, 1, 0)) %>%
  relocate(country, trade_open, trade)
# prints updated contents of "pt_copy"; note the newly created dummy variable
pt_copy
## # A tibble: 85 × 77
##    country  trade_open trade catho80 non_catholic_80 party_list  oecd  pind pindo ctrycd col_uk
##    <chr>         <dbl> <dbl>   <dbl>           <dbl>      <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 Singapo…          1  343.    4.70           95.3         0       0     1     1    576      1
##  2 Malta             1  190.   97.3             2.70       65.9     0     0     1    181      1
##  3 Luxembo…          1  189.   93               7          60       1     0     1    137      0
##  4 Malaysia          1  176.    2.80           97.2         0       0     1     1    548      1
##  5 Estonia           1  154.    2              98         101       0     0     1    939      0
##  6 Belgium           1  132.   90              10         184.      1     0     0    124      0
##  7 Ireland           1  129.   95.3             4.70      166       1     0     1    178      1
##  8 Mauriti…          1  128.   31.2            68.8         0       0     1     1    684      1
##  9 St. Vin…          1  123.   19.4            80.6         0       0     1     1    364      1
## 10 Jamaica           1  122.    9.60           90.4         0       0     1     1    343      1
## # … with 75 more rows, and 66 more variables: t_indep <dbl>, col_uka <dbl>, col_espa <dbl>,
## #   col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>,
## #   legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>,
## #   logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>,
## #   frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>,
## #   rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>, spropn <dbl>,
## #   yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, pres <dbl>, lyp <dbl>, semi <dbl>, …

4.6.2 Creating categorical variables from continuous numeric variables

# Creates a new character variable in "pt_copy" named "trade_level", coded as:
#   "Low_Trade"          when "trade" is greater than 15 and less than 50
#   "Intermediate_Trade" when "trade" is greater than or equal to 50 and less than 100
#   "High_Trade"         when "trade" is greater than or equal to 100
# Observations matching none of these conditions (i.e. "trade" of 15 or below)
# are left as NA. The dataset is then reordered so that "country",
# "trade_level", and "trade" are the first three variables
pt_copy <- pt_copy %>%
  mutate(
    trade_level = case_when(
      trade > 15 & trade < 50 ~ "Low_Trade",
      trade >= 50 & trade < 100 ~ "Intermediate_Trade",
      trade >= 100 ~ "High_Trade"
    )
  ) %>%
  relocate(country, trade_level, trade)
# prints updated contents of "pt_copy"; note the newly created categorical variable
pt_copy
## # A tibble: 85 × 78
##    country    trade_level trade trade_open catho80 non_catholic_80 party_list  oecd  pind pindo
##    <chr>      <chr>       <dbl>      <dbl>   <dbl>           <dbl>      <dbl> <dbl> <dbl> <dbl>
##  1 Singapore  High_Trade   343.          1    4.70           95.3         0       0     1     1
##  2 Malta      High_Trade   190.          1   97.3             2.70       65.9     0     0     1
##  3 Luxembourg High_Trade   189.          1   93               7          60       1     0     1
##  4 Malaysia   High_Trade   176.          1    2.80           97.2         0       0     1     1
##  5 Estonia    High_Trade   154.          1    2              98         101       0     0     1
##  6 Belgium    High_Trade   132.          1   90              10         184.      1     0     0
##  7 Ireland    High_Trade   129.          1   95.3             4.70      166       1     0     1
##  8 Mauritius  High_Trade   128.          1   31.2            68.8         0       0     1     1
##  9 St. Vince… High_Trade   123.          1   19.4            80.6         0       0     1     1
## 10 Jamaica    High_Trade   122.          1    9.60           90.4         0       0     1     1
## # … with 75 more rows, and 68 more variables: ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>,
## #   col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>,
## #   legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>,
## #   eduger <dbl>, spropn <dbl>, yearele <dbl>, yearreg <dbl>, seats <dbl>, maj <dbl>, …

4.6.3 Creating dummy variables from categorical variables

# Creates one 0/1 dummy variable for each category of the "trade_level" column
# (the new columns are named "trade_level_" followed by the category), and
# relocates the new dummies to the front of the dataset after "country" and
# "trade_level"
pt_copy <- pt_copy %>%
  dummy_cols(select_columns = "trade_level") %>%
  relocate(
    country,
    trade_level,
    trade_level_High_Trade,
    trade_level_Intermediate_Trade,
    trade_level_Low_Trade
  )
# Prints contents of "pt_copy"
pt_copy
## # A tibble: 85 × 81
##    country      trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade trade_open
##    <chr>        <chr>                  <int>            <int>            <int> <dbl>      <dbl>
##  1 Singapore    High_Trade                 1                0                0  343.          1
##  2 Malta        High_Trade                 1                0                0  190.          1
##  3 Luxembourg   High_Trade                 1                0                0  189.          1
##  4 Malaysia     High_Trade                 1                0                0  176.          1
##  5 Estonia      High_Trade                 1                0                0  154.          1
##  6 Belgium      High_Trade                 1                0                0  132.          1
##  7 Ireland      High_Trade                 1                0                0  129.          1
##  8 Mauritius    High_Trade                 1                0                0  128.          1
##  9 St. Vincent… High_Trade                 1                0                0  123.          1
## 10 Jamaica      High_Trade                 1                0                0  122.          1
## # … with 75 more rows, and 74 more variables: catho80 <dbl>, non_catholic_80 <dbl>,
## #   party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## #   t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## #   legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## #   avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, …

4.7 Subsetting rows of data based on criteria

4.7.1 The filter function

# Keeps only the OECD observations in "pt_copy" (rows where "oecd" equals 1),
# moves "country" and "oecd" to the front, and assigns the result to a new
# object named "oecd_countries"
oecd_countries <- pt_copy %>%
  filter(oecd == 1) %>%
  relocate(country, oecd)
# Prints contents of "oecd_countries"
oecd_countries
## # A tibble: 25 × 81
##    country      oecd trade_level       trade_level_Hig… trade_level_Int… trade_level_Low… trade
##    <chr>       <dbl> <chr>                        <int>            <int>            <int> <dbl>
##  1 Luxembourg      1 High_Trade                       1                0                0 189. 
##  2 Belgium         1 High_Trade                       1                0                0 132. 
##  3 Ireland         1 High_Trade                       1                0                0 129. 
##  4 Netherlands     1 High_Trade                       1                0                0 100. 
##  5 Austria         1 Intermediate_Tra…                0                1                0  78.3
##  6 Norway          1 Intermediate_Tra…                0                1                0  72.2
##  7 Switzerland     1 Intermediate_Tra…                0                1                0  68.9
##  8 Portugal        1 Intermediate_Tra…                0                1                0  68.5
##  9 Sweden          1 Intermediate_Tra…                0                1                0  68.1
## 10 Iceland         1 Intermediate_Tra…                0                1                0  66.9
## # … with 15 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## #   non_catholic_80 <dbl>, party_list <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>,
## #   col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## #   legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## #   yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## #   gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …
# Keeps only the observations for which "cgrev" (central government revenue as
# % of gdp) is greater than 40, moves "country" and "cgrev" to the front, and
# assigns the result to a new object named "high_revenues"; observations with a
# missing "cgrev" do not satisfy the condition and are therefore excluded
high_revenues <- pt_copy %>%
  filter(cgrev > 40) %>%
  relocate(country, cgrev)
# Prints contents of "high_revenues"
high_revenues
## # A tibble: 10 × 81
##    country     cgrev trade_level       trade_level_Hig… trade_level_Int… trade_level_Low… trade
##    <chr>       <dbl> <chr>                        <int>            <int>            <int> <dbl>
##  1 Luxembourg   45.5 High_Trade                       1                0                0 189. 
##  2 Belgium      43.7 High_Trade                       1                0                0 132. 
##  3 Netherlands  47.6 High_Trade                       1                0                0 100. 
##  4 Botswana     50.8 Intermediate_Tra…                0                1                0  87.5
##  5 Hungary      45.6 Intermediate_Tra…                0                1                0  73.7
##  6 Norway       41.1 Intermediate_Tra…                0                1                0  72.2
##  7 Sweden       40.8 Intermediate_Tra…                0                1                0  68.1
##  8 Poland       40.3 Low_Trade                        0                0                1  48.3
##  9 France       40.9 Low_Trade                        0                0                1  44.9
## 10 Italy        41.2 Low_Trade                        0                0                1  44.3
## # … with 74 more variables: trade_open <dbl>, catho80 <dbl>, non_catholic_80 <dbl>,
## #   party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## #   t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## #   legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## #   avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, eduger <dbl>, …
# Keeps only the observations for which the "catho80" variable is less than or
# equal to 50, moves "country" and "catho80" to the front, and assigns the
# result to a new object named "minority_catholic"
minority_catholic <- pt_copy %>%
  filter(catho80 <= 50) %>%
  relocate(country, catho80)
# Prints contents of "minority_catholic"
minority_catholic
## # A tibble: 53 × 81
##    country       catho80 trade_level trade_level_High_… trade_level_Int… trade_level_Low… trade
##    <chr>           <dbl> <chr>                    <int>            <int>            <int> <dbl>
##  1 Singapore        4.70 High_Trade                   1                0                0  343.
##  2 Malaysia         2.80 High_Trade                   1                0                0  176.
##  3 Estonia          2    High_Trade                   1                0                0  154.
##  4 Mauritius       31.2  High_Trade                   1                0                0  128.
##  5 St. Vincent&G   19.4  High_Trade                   1                0                0  123.
##  6 Jamaica          9.60 High_Trade                   1                0                0  122.
##  7 Gambia           1.90 High_Trade                   1                0                0  122.
##  8 Fiji             9    High_Trade                   1                0                0  119.
##  9 Belarus         14    High_Trade                   1                0                0  117.
## 10 Barbados         5.90 High_Trade                   1                0                0  116.
## # … with 43 more rows, and 74 more variables: trade_open <dbl>, non_catholic_80 <dbl>,
## #   party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>,
## #   t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>,
## #   legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>,
## #   avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, federal <dbl>, …

Using the & operator

# Keeps only the rows of "pt_copy" that satisfy BOTH conditions (oecd is 1 AND
# federal is 1), moves "country", "oecd", and "federal" to the front, and
# stores the result in a new object named "oecd_federal_countries"
oecd_federal_countries <- pt_copy %>%
  filter(oecd == 1 & federal == 1) %>%
  relocate(country, oecd, federal)
# Displays "oecd_federal_countries"
oecd_federal_countries
## # A tibble: 7 × 81
##   country     oecd federal trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
##   <chr>      <dbl>   <dbl> <chr>                  <int>            <int>            <int> <dbl>
## 1 Austria        1       1 Intermedia…                0                1                0  78.3
## 2 Switzerla…     1       1 Intermedia…                0                1                0  68.9
## 3 Canada         1       1 Intermedia…                0                1                0  66.5
## 4 Germany        1       1 Low_Trade                  0                0                1  48.8
## 5 Mexico         1       1 Low_Trade                  0                0                1  47.5
## 6 Australia      1       1 Low_Trade                  0                0                1  38.8
## 7 USA            1       1 Low_Trade                  0                0                1  23.0
## # … with 73 more variables: trade_open <dbl>, catho80 <dbl>, non_catholic_80 <dbl>,
## #   party_list <dbl>, pind <dbl>, pindo <dbl>, ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>,
## #   col_uka <dbl>, col_espa <dbl>, col_otha <dbl>, legor_uk <dbl>, legor_so <dbl>,
## #   legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>, prot80 <dbl>, confu <dbl>, avelf <dbl>,
## #   govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>, yrsopen <dbl>, gadp <dbl>,
## #   engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>, gastil <dbl>, cgexp <dbl>,
## #   cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, prop65 <dbl>, eduger <dbl>, …

Using the | operator

# Keeps only the rows of "pt_copy" that satisfy EITHER condition ("continent"
# is "africa" OR "continent" is "asiae"), moves the "continent" column to the
# front, and stores the result in a new object named "asia_europe_africa"
asia_europe_africa <- pt_copy %>%
  filter(continent == "africa" | continent == "asiae") %>%
  relocate(continent)
# Displays "asia_europe_africa"
asia_europe_africa
## # A tibble: 24 × 81
##    continent country       trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
##    <chr>     <chr>         <chr>                  <int>            <int>            <int> <dbl>
##  1 asiae     Singapore     High_Trade                 1                0                0 343. 
##  2 asiae     Malaysia      High_Trade                 1                0                0 176. 
##  3 africa    Mauritius     High_Trade                 1                0                0 128. 
##  4 africa    Gambia        High_Trade                 1                0                0 122. 
##  5 asiae     Fiji          High_Trade                 1                0                0 119. 
##  6 africa    Namibia       High_Trade                 1                0                0 115. 
##  7 asiae     Papua N. Guin High_Trade                 1                0                0 102. 
##  8 asiae     Taiwan        Intermedia…                0                1                0  90.7
##  9 africa    Botswana      Intermedia…                0                1                0  87.5
## 10 asiae     Thailand      Intermedia…                0                1                0  84.9
## # … with 14 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## #   non_catholic_80 <dbl>, party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>,
## #   ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## #   legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## #   yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## #   gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …

Filtering for observations that do NOT meet a condition:

# Keeps only the rows of "pt_copy" whose "continent" value is NOT "africa",
# moves the "continent" column to the front, and stores the result in a new
# object named "pt_copy_sans_africa"
pt_copy_sans_africa <- pt_copy %>%
  filter(continent != "africa") %>%
  relocate(continent)
# Displays "pt_copy_sans_africa"
pt_copy_sans_africa
## # A tibble: 74 × 81
##    continent country       trade_level trade_level_Hig… trade_level_Int… trade_level_Low… trade
##    <chr>     <chr>         <chr>                  <int>            <int>            <int> <dbl>
##  1 asiae     Singapore     High_Trade                 1                0                0  343.
##  2 other     Malta         High_Trade                 1                0                0  190.
##  3 other     Luxembourg    High_Trade                 1                0                0  189.
##  4 asiae     Malaysia      High_Trade                 1                0                0  176.
##  5 other     Estonia       High_Trade                 1                0                0  154.
##  6 other     Belgium       High_Trade                 1                0                0  132.
##  7 other     Ireland       High_Trade                 1                0                0  129.
##  8 laam      St. Vincent&G High_Trade                 1                0                0  123.
##  9 laam      Jamaica       High_Trade                 1                0                0  122.
## 10 other     Slovak Repub… High_Trade                 1                0                0  119.
## # … with 64 more rows, and 74 more variables: trade_open <dbl>, catho80 <dbl>,
## #   non_catholic_80 <dbl>, party_list <dbl>, oecd <dbl>, pind <dbl>, pindo <dbl>,
## #   ctrycd <dbl>, col_uk <dbl>, t_indep <dbl>, col_uka <dbl>, col_espa <dbl>, col_otha <dbl>,
## #   legor_uk <dbl>, legor_so <dbl>, legor_fr <dbl>, legor_ge <dbl>, legor_sc <dbl>,
## #   prot80 <dbl>, confu <dbl>, avelf <dbl>, govef <dbl>, graft <dbl>, logyl <dbl>, loga <dbl>,
## #   yrsopen <dbl>, gadp <dbl>, engfrac <dbl>, eurfrac <dbl>, frankrom <dbl>, latitude <dbl>,
## #   gastil <dbl>, cgexp <dbl>, cgrev <dbl>, ssw <dbl>, rgdph <dbl>, prop1564 <dbl>, …