Jigsaw Rate Severity of Toxic Comments(正規表現エスケープ+顔文字+大文字小文字)
置換その2。正規表現のエスケープ+顔文字抽出+大文字小文字。
参考サイト
やってみた
正規表現のエスケープ
最初のきっかけはnotebookそのまま使ったけど置換できてなくない?から。正規表現モジュールre
を使わずに.replaceの中で正規表現のエスケープの記載を入れてもうまくいかないよなと。多分そういうことだと理解。他のNotebookをそのまま使う前にはちょっと確認しないとなと。
tmp = "You, sir, are my hero. Any chance you remember what page that's on?" tmp1 = tmp.replace(r"\'s", " ") tmp2 = re.sub(r"\'s", " ", tmp) print(f"tmp:{tmp}") # tmp:You, sir, are my hero. Any chance you remember what page that's on? print(f"tmp1:{tmp1}") # tmp1:You, sir, are my hero. Any chance you remember what page that's on? print(f"tmp2:{tmp2}") # tmp2:You, sir, are my hero. Any chance you remember what page that on?
def short_word_rep(text):
    """Expand common English contractions in *text*.

    Fixes vs. the original:
    - Regex substitutions now run longest-pattern-first: the old code applied
      the "'s" rule before "'scuse"/"'ll", so "'scuse" was mangled to " cuse"
      and the "'scuse" rule could never match.
    - The old patterns r"\'re'" and r"\'ve'" had stray trailing quotes, so
      "'re"/"'ve" were only expanded when followed by a literal apostrophe.
    """
    text = text.replace(r"what's", "what is ")
    text = text.replace(r"can't", "cannot ")
    text = text.replace(r"n't", " not ")
    text = text.replace(r"i'm", "i am ")
    text = text.replace(r"I'm", "i am ")
    # Longest-first so shorter suffix rules can't clobber longer ones.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'s", " ", text)
    return text


def text_cleaning(text):
    """Normalize one raw comment for modelling.

    Pipeline: expand contractions -> drop URLs -> strip HTML tags ->
    remove emoji -> keep only letters/digits/'!' -> collapse '!' runs and
    whitespace -> strip.
    """
    text = short_word_rep(text)
    # Drop http(s):// and www. URLs.
    template = re.compile(r'https?://\S+|www\.\S+')
    text = template.sub(r'', text)
    # Strip HTML markup, keep only the visible text.
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()
    # Remove emoji / pictographs / transport symbols / flags / dingbats.
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub(r'', text)
    # Keep letters, digits and '!' ('!' may carry aggression signal).
    # Fix: the original class [^a-zA-Z\d|!] treated '|' as a literal
    # character to keep — '|' is not alternation inside [] — so pipe
    # characters leaked through the cleaning.
    text = re.sub(r"[^a-zA-Z\d!]", " ", text)
    text = re.sub(r"!+", r"!", text)  # collapse runs of '!' to one
    text = re.sub(' +', ' ', text)    # squeeze repeated spaces
    return text.strip()


clean_df = df.copy(deep=True)
clean_df["text"] = df["text"].apply(text_cleaning)
顔文字
試しに「:(」という顔文字だけを抽出。この顔文字があるコメントは52件と少ないけど攻撃的なスコアが高い!この情報を落とすのはよくないだろう。こんなに少ないのであれば、この顔文字があるかどうかも攻撃的スコア自体に組み込んでもよいのでは??とりあえずsad
という単語に変換するけど。。
# Flag comments containing the sad emoticon ":(" and compare mean toxicity.
df["face"] = df["text"].apply(lambda t: re.search(r"\:\(", t))
df["face_flg"] = df["face"].apply(
    lambda m: "sad_face" if m is not None else "NOT_sad_face"
)
agg_d = {"y": ["mean", "count"]}
group = df.groupby("face_flg").agg(agg_d)
group.columns = ["_".join(cols) for cols in group.columns]
group
# Take one sad-face comment and replace the ":(" emoticon with the word "sad".
sad_rows = df[df.face_flg == "sad_face"]
tmp = sad_rows.iloc[0, 0]
tmp_sub = re.sub(r"\:\(", "sad", tmp)
print(f"tmp:{tmp}")
print(f"tmp1:{tmp_sub}")
tmp:closedmouth is a DICK-FUCK this guy is a jackass who sux cock 4 a fuckin job y the fuck does the trans do it?????cuz its into tht kinda shit >:( tmp1:closedmouth is a DICK-FUCK this guy is a jackass who sux cock 4 a fuckin job y the fuck does the trans do it?????cuz its into tht kinda shit > sad
逆に笑顔顔文字はスコアが低いわけでもなかった。よって情報なしということで削除してしまう。
# Flag comments containing the happy emoticon "(:" and compare mean toxicity.
df["face"] = df["text"].apply(lambda t: re.search(r"\(\:", t))
df["face_flg"] = df["face"].apply(
    lambda m: "happy_face0" if m is not None else "NOT_happy_face0"
)
agg_d = {"y": ["mean", "count"]}
group = df.groupby("face_flg").agg(agg_d)
group.columns = ["_".join(cols) for cols in group.columns]
group
大文字小文字
GOODとgoodだったらなんとなく大文字の方が強い表現に見えるので。単語内に2文字以上大文字があったら大文字に、1文字以下であれば小文字にする。
# Uppercase any word holding two or more capital letters (shouting/emphasis);
# lowercase everything else.
text = "He is Very VeRy GOOD boy YEah"
normalized = []
for word in text.split(" "):
    capitals = re.findall(r"[A-Z]", word)
    normalized.append(word.upper() if len(capitals) > 1 else word.lower())
text = " ".join(normalized)
text  # 'he is very VERY GOOD boy YEAH'
実際、大文字が10文字以上あるテキストの攻撃性y
は通常のものより高そう。
# Bucket cleaned comments by whether they contain more than 10 uppercase
# letters, then compare toxicity statistics between the buckets.
tmp = clean_df.copy(deep=True)

def _upper_bucket(s):
    # ">10 capitals" marks likely shouting comments.
    return "NOT_zero" if len(re.findall(r"[A-Z]", s)) > 10 else "zero"

tmp["upper_flg"] = tmp["text"].apply(_upper_bucket)
agg_d = {"y": ["mean", "std", "count"]}
tmp.groupby("upper_flg").agg(agg_d)