Day 21 ] Project2 유사도 분석

2019. 5. 29. 22:14

1. 파일 읽어오는 함수

# 내 파일 읽어오기

from os import listdir
def fileids(path, ext="txt"):
    return [path+file for file in listdir(path) if "사회" in file]

def filecontent(file):
    with open(file, encoding="utf-8") as fp:
        content = fp.read()
    return content

def ngram(term, n=2):
    return [term[i:i+n] for i in range(len(term) - n + 1)]

2. 불필요한 문자제거용 정규표현식

# 불필요한 문자열 제거
# 정규식, 길이, 구두점 제거
from string import punctuation
import re

pattern = dict()

# 구두점
pattern1 = re.compile(r"[{0}]".format(re.escape(punctuation)))
# corpus = pattern1.sub(" ", corpus)
pattern["punc"] = pattern1

# 불용어
pattern2 = re.compile(r"[A-Za-z0-9]{7,}")
# corpus = pattern2.sub(" ", corpus)
pattern["stopword"] = pattern2

# 이메일
pattern3 = re.compile(r"\w{2,}@\w{3,}(.\w{2,})+")
# corpus = pattern3.sub(" ", corpus)
pattern["email"] = pattern3

# 도메인
pattern4 = re.compile(r"(.?\w{2,}){2,}")
# corpus = pattern4.sub(" ", corpus)
pattern["domain"] = pattern4

# 한글, 숫자만 남기기
pattern5 = re.compile(r"[^가-힣0-9]+")
# corpus = pattern5.sub(" ", corpus)
pattern["nonkorean"] = pattern5

# 반복되는 공백문자
pattern6 = re.compile(r"\s")
# corpus = pattern6.sub(" ", corpus)
pattern["whitespace"] = pattern6

3. DTM만들기

from collections import defaultdict # 키가 없을 때 에러가 없도록 하기 위함
# 전체 처리해보기

DTM = defaultdict(lambda: defaultdict(int))
for file in fileids("./scraping/"):
    with open(file, encoding="utf-8") as fp:
        content = fp.read()
    # 1. 문자열 가져오기
    indexTerm1 = defaultdict(int)
    indexTerm2 = defaultdict(int)
    indexTerm3 = defaultdict(int)
    indexTerm4 = defaultdict(int)
    indexTerm5 = defaultdict(int)
    indexTerm = defaultdict(int)

    # 2. 불필요한 문자 제거
    for _ in ["email", "punc", "stopword", "whitespace"]:
        content = pattern[_].sub(" ", content)

    # 3. 특징점 분리
    for term in word_tokenize(content):
        indexTerm1[term] += 1

    for _ in indexTerm1:
        for t in ma.pos(_):
            indexTerm2[t] += 1 # 원시형태소 + 품사
            DTM[file][t] += 1
            if len(t[0]) > 1: # 음절 길이로 정규화
                indexTerm3[t[0]] += 1 # 원시형태소
                DTM[file][t[0]] += 1
            if t[1].startswith("N"):
                indexTerm4[t[0]] += 1 # 명사
                DTM[file][t[0]] += 1
            for n in ngram(t[0]): # N그램(바이그램)
                indexTerm5[n] += 1
                DTM[file][n] += 1
#            DTM[file][term] += 1

4. TDM 만들기

- DTM의 역순( document이름, 단어 => 단어, document이름 )

TDM = defaultdict(lambda: defaultdict(int))
for idx, termList in DTM.items():
for term, freq in termList.items():
TDM[term][idx] = freq

5. 문서의 벡터값 구하기

- 각 단어에 대한 다차원에서 벡터값을 구한다

from math import log2, sqrt

TWM = defaultdict(lambda:defaultdict(float))
N = len(DTM)

DVL = defaultdict(float)
for idx, termList in DTM.items():
    maxTF = max(termList.values())
    for term, freq in termList.items():
        # 데이터 벡터
        TF =  freq/maxTF
        IDF = log2(N/len(TDM[term]))
        TWM[term][idx] = TF*IDF
        DVL[idx] += TWM[term][idx]**2

for idx, length in DVL.items():
    DVL[idx] = sqrt(length)

6. Query도 모든 1~5번과정을 거치기

from nltk.tokenize import sent_tokenize
query = '서울시에 거래되는 아파트의 전세값은'

TQM = defaultdict(int)
QWM = defaultdict(float)

# 1. 문자열 가져오기
indexTerm1 = defaultdict(int)
indexTerm2 = defaultdict(int)
indexTerm3 = defaultdict(int)
indexTerm4 = defaultdict(int)
indexTerm5 = defaultdict(int)
QDTM = defaultdict(int)

# 2. 불필요한 문자 제거
for _ in ["email", "punc", "stopword", "whitespace"]:
    query = pattern[_].sub(" ", query)

# 3. 특징점 분리
for _ in word_tokenize(query):
    for t in ma.pos(_):
        indexTerm2[t] += 1 # 원시형태소 + 품사
        QDTM[t] += 1
        if len(t[0]) > 1: # 음절 길이로 정규화
            indexTerm3[t[0]] += 1 # 원시형태소
            QDTM[t[0]] += 1
        if t[1].startswith("N"):
            indexTerm4[t[0]] += 1 # 명사
            QDTM[t[0]] += 1
        for n in ngram(t[0]): # N그램(바이그램)
            indexTerm5[n] += 1
            QDTM[n] += 1
        QDTM[_] += 1
print(len(QDTM))

# QDTM => TQM
for term, freq in QDTM.items():
    TQM[term] = freq

# TQM => QWM
alpha = 0.5
maxTF = max(TQM.values())
for term, ferq in TQM.items():
    TF = alpha + (1-alpha)*(freq/maxTF)
    DF = len(TWM[term]) if len(TWM[term]) > 0 else 1
    IDF = log2(N/DF)
    QWM[term] = TF*IDF

# 유사도
candidateList = defaultdict(float)
for term, weight1 in QWM.items():
    for doc, weight2 in TWM[term].items():
        innerProduct = weight1 * weight2
        candidateList[doc] += innerProduct

for doc, sim in candidateList.items():
    candidateList[doc] = sim/DVL[doc]

7. 벡터 유사도 출력
K = 10
for doc, sim in sorted(candidateList.items(), key=lambda x:x[1], reverse=True)[:K]:
    print('문서이름:{0} / 유사도:{1:.4f}'.format(doc, sim))
    #print(sent_tokenize(kobill.open(doc).read())[:3])
    print()
    print(sent_tokenize(filecontent(doc))[:3])

8. 유클리디언 거리 구하기

candidateList = defaultdict(float)
for term, docList in TWM.items():
    for doc, weight1 in docList.items():
        weight2 = QTWM[term]
        candidateList[doc] += (weight1 - weight2) ** 2

for doc, sim in candidateList.items():
    candidateList[doc] = sqrt(sim)

9. 유클리디언 거리 출력하기

from nltk.tokenize import sent_tokenize
K = 5
for doc, sim in sorted(candidateList.items(), key=lambda x:x[1], reverse=False)[:K]:
    print("문서이름:{0} / 거리:{1:.4f}".format(doc, sim))
    print(sent_tokenize(filecontent(doc))[:5])
    print()

'데이터 분석가 역량' 카테고리의 다른 글

Day 25 ] Naive Bayes (0)	2019.06.05
Day 22 ] Classification (0)	2019.05.30
Day 20] 내 파일로 분석해보기 (0)	2019.05.29
Day 19 ] Deep Learning (0)	2019.05.27
Day 18 ] 정보검색 - Vector 공간 (0)	2019.05.27

Stack Writing

Day 21 ] Project2 유사도 분석

'데이터 분석가 역량' 카테고리의 다른 글

+ Recent posts

티스토리툴바