Skip to content

Commit b6cc04d

Browse files
authored
Merge pull request #8 from UCSB-Library-Research-Data-Services/textanalysis
Textanalysis
2 parents a0ad0ba + 04ac502 commit b6cc04d

File tree

1 file changed

+91
-0
lines changed

1 file changed

+91
-0
lines changed

scripts/workbook.qmd

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
2+
## Import Libraries
3+
4+
```{r}
5+
library(tidyverse)
6+
library(tidytext) # text mining
7+
library(stringr) # string manipulation
8+
library(stringi) # string operations (emoji replacement)
9+
library(dplyr) # data wrangling
10+
library(textclean) # expand contractions
11+
library(emo)
12+
library(textstem) # lemmatization
13+
```
14+
15+
### Read comments data
16+
17+
```{r}
18+
# Load the raw comments table from CSV into `comments`.
# The path is relative, so it resolves against the working directory in
# effect when this chunk runs.
comments <- readr::read_csv("../data/raw/comments.csv")
19+
20+
```
21+
22+
### Patterns
23+
24+
```{r}
25+
# Regular expressions used by the text-cleaning steps.

# Apostrophe look-alikes (curly quotes, backtick, acute accent, prime, ...).
apostrophes <- "[‘’‛ʼ❛❜'`´′]"

# Links starting with http://, https:// or www., up to whitespace or a comma.
url_pattern <- "http[s]?://[^\\s,]+|www\\.[^\\s,]+"

# No-break space and the bidirectional isolate marks.
hidden_characters <- "[\u00A0\u2066\u2067\u2068\u2069]"

# "@" followed by letters, digits or underscores.
mentions <- "@[A-Za-z0-9_]+"

# POSIX punctuation plus curly quotes, dashes, ellipsis, "|" and "+".
punctuation <- "[[:punct:]“”‘’–—…|+]"

# A single digit.
numbers <- "[[:digit:]]"

# Any character repeated three or more times in a row; group 1 holds it.
repeated_chars <- "(.)\\1{2,}"

# A lower-case letter directly followed by an upper-case letter, unless the
# lower-case letter comes right after "#" or "@". Both letters are captured
# so a separator can be inserted between them.
hashtag_splitter <- "(?<![#@])([a-z])([A-Z])"
33+
```
34+
35+
### Create emoji dictionary
36+
37+
```{r}
38+
# Emoji lookup table: keep only the `emoji` and `name` columns of emo::jis
# and append one extra row for the waffle emoji.
emoji_dict <- add_row(
  emo::jis[c("emoji", "name")],
  emoji = "🧇",
  name = "waffle"
)

# Print the table for inspection.
emoji_dict
41+
```
42+
43+
### Clean emojis function
44+
45+
```{r}
46+
# Replace every emoji in `text` with its name from `emoji_dict`.
#
# Arguments:
#   text        Character vector to process.
#   emoji_dict  Table with an `emoji` column (strings to find) and a `name`
#               column (the replacement for the emoji in the same row).
#
# Returns a character vector the same length as `text`. Each inserted name is
# set off by single spaces; runs of whitespace are collapsed to one space and
# leading/trailing whitespace is removed. NA elements stay NA.
replace_emojis <- function(text, emoji_dict) {
  emojis <- emoji_dict$emoji
  labels <- emoji_dict$name

  if (length(emojis) > 0) {
    # With vectorize_all = FALSE the patterns are applied one after another
    # to the same string, so longer sequences must go first; otherwise a
    # shorter emoji contained in them would be replaced and break them up.
    longest_first <- order(stri_length(emojis), decreasing = TRUE)

    text <- stri_replace_all_fixed(
      str = text,
      pattern = emojis[longest_first],
      # Pad both sides so a name never fuses with the neighbouring word
      # or with the name of an adjacent emoji.
      replacement = paste0(" ", labels[longest_first], " "),
      vectorize_all = FALSE
    )
  }

  # Normalise the whitespace introduced by the padding above.
  stri_trim_both(stri_replace_all_regex(text, "\\s+", " "))
}
54+
```
55+
56+
### Clean data
57+
58+
Hidden characters (matched by `hidden_characters` and replaced with a space)
59+
60+
`\u00A0` → non-breaking space
61+
`\u2066` → LEFT-TO-RIGHT ISOLATE (LRI)
62+
`\u2067` → RIGHT-TO-LEFT ISOLATE (RLI)
63+
`\u2068` → FIRST STRONG ISOLATE (FSI)
64+
`\u2069` → POP DIRECTIONAL ISOLATE (PDI)
65+
66+
67+
```{r}
68+
# Build `clean_text` from the raw `text` column of `comments`.
# The order of the steps matters:
#   * URLs go first, while their punctuation is still intact.
#   * @mentions are removed before the camelCase split; otherwise
#     "@JohnDoe" is split into "@John Doe" and "doe" is left behind.
#   * Whitespace is squished last, so the spaces introduced by the emoji
#     names are normalised as well.
comments_clean <- comments %>%
  mutate(
    clean_text = text %>%
      str_replace_all(url_pattern, "") %>% # url pattern goes first
      str_replace_all(hidden_characters, " ") %>%
      str_replace_all(apostrophes, "'") %>% # normalise before contractions
      replace_contraction() %>%
      str_replace_all(mentions, "") %>% # must precede the camelCase split
      str_replace_all(hashtag_splitter, "\\1 \\2") %>% # optional
      str_to_lower() %>%
      str_replace_all(punctuation, " ") %>%
      str_replace_all(numbers, " ") %>% # each digit becomes a space
      str_replace_all(repeated_chars, "\\1") %>% # "sooo" -> "so"
      replace_emojis(emoji_dict) %>%
      str_squish() # collapse and trim all whitespace left by the steps above
  )
84+
```
85+
86+
87+
### Save CSV
88+
89+
```{r}
90+
# Save the cleaned comments. Create the output directory first so the write
# does not fail when it is missing; an existing directory is left untouched.
dir.create("../data/clean", recursive = TRUE, showWarnings = FALSE)
write_csv(comments_clean, "../data/clean/comments_clean.csv")
91+
```

0 commit comments

Comments
 (0)