Membuat Deteksi Plagiasi Dengan Python

Dataset

Kodingan

Crawl Dokumen Dengan Glob

import glob
# Collect every student submission in the dataset folder. glob returns the
# matches in arbitrary (filesystem) order, so index 0 is not necessarily
# the alphabetically first file.
filedok = glob.glob('datasets/*.txt')

# Read each document fully; the context manager closes every handle as soon
# as its content has been read, and the encoding is stated explicitly so the
# result does not depend on the platform's locale default.
text_file = []
for path in filedok:
    with open(path, encoding='utf-8') as handle:
        text_file.append(handle.read())

print(filedok)
print('='*48)
# Show the first document as a sanity check; skipped when the folder is
# empty, because indexing an empty list would raise IndexError.
if text_file:
    print(text_file[0])
['datasets/siswa3.txt', 'datasets/siswa2.txt', 'datasets/siswa1.txt']
================================================
The dimension of the smartphone is 164.9 x 75.1 x 8.5 mm and it weighs 188 grams. It is powered by Qualcomm SM4250 Snapdragon 460 processor and comes in 6.52 inches IPS LCD, which is protected by Corning Gorilla Glass 3.
from sklearn.feature_extraction.text import TfidfVectorizer
# Weight the terms of every document with TF-IDF.
vectorizer = TfidfVectorizer()
# Learn the vocabulary from the documents and build the weight matrix,
# then convert it to a regular array with one row per document.
tfidf_matrix = vectorizer.fit_transform(text_file)
vec = tfidf_matrix.toarray()
['164', '188', '460', '52', '75', 'ago', 'an', 'and', 'another', 'birds', 'british', 'by', 'comes', 'corning', 'creation', 'creatures', 'dimension', 'farmer', 'fishes', 'for', 'from', 'glass', 'gorilla', 'goverment', 'government', 'grams', 'happy', 'here', 'history', 'home', 'hundred', 'immaculate', 'in', 'inches', 'innumerable', 'ips', 'is', 'islands', 'it', 'lay', 'lcd', 'lions', 'lived', 'long', 'millions', 'mm', 'more', 'no', 'of', 'other', 'powered', 'predestined', 'processor', 'protected', 'qualcomm', 'quite', 'republican', 'resort', 'sea', 'sm4250', 'smartphone', 'snapdragon', 'sort', 'store', 'the', 'there', 'these', 'times', 'to', 'up', 'was', 'wealth', 'weighs', 'were', 'when', 'which', 'world', 'written']
[0.         0.         0.         0.         0.         0.09769707
0.09769707 0.15174098 0.09769707 0.09769707 0.09769707 0.
0. 0. 0.09769707 0.19539414 0. 0.09769707
0.09769707 0.19539414 0.09769707 0. 0. 0.12845991
0. 0. 0.09769707 0.09769707 0.09769707 0.09769707
0.09769707 0.09769707 0. 0. 0.09769707 0.
0. 0.09769707 0. 0.09769707 0. 0.09769707
0.09769707 0.09769707 0.19539414 0. 0.09769707 0.09769707
0.53109344 0.09769707 0. 0.09769707 0. 0.
0. 0.09769707 0. 0.09769707 0.09769707 0.
0. 0. 0.09769707 0.19539414 0.37935246 0.09769707
0.09769707 0.09769707 0.09769707 0.09769707 0.09769707 0.09769707
0. 0.09769707 0.09769707 0. 0.09769707 0.09769707]
# Pair every file name with its TF-IDF row vector.
vec_list = list(zip(filedok, vec))

from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

# Unique (file_a, file_b, "NN.N%") results; a set silently drops duplicates.
plag = set()

# combinations() yields each unordered pair of documents exactly once, so a
# document is never compared with itself and no pair is computed twice.
for (siswa, text_vector), (siswa_a, text_vector_a) in combinations(vec_list, 2):
    # cosine_similarity on two rows returns a 2x2 matrix; entry [0][1] is
    # the similarity between the first and the second document.
    sim = cosine_similarity([text_vector, text_vector_a])[0][1]
    # Sort the two names so the pair is reported in a stable order.
    student_pair = sorted((siswa, siswa_a))
    score = (student_pair[0], student_pair[1], f"{sim * 100:.1f}%")
    plag.add(score)

# Sets have no defined iteration order; sort for reproducible output.
for x in sorted(plag):
    print(x)
('siswa1.txt', 'siswa2.txt', '97.5%')
('siswa1.txt', 'siswa3.txt', '13.9%')
('siswa2.txt', 'siswa3.txt', '14.0%')

Referensi

Tetap Terhubung dengan Kami
Share this
×