Commit 8f565798 by Yolanda Nainggolan

Added searching, fixed result and lyrics

parent 2d25e3c9
@@ -2,19 +2,86 @@ resource_package = __name__
import string
import re
import collections
import math
import pandas as pd
import json
import xml.dom.minidom as minidom
import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
try:
from itertools import izip as zip # < 2.5 or 3.x
except ImportError:
pass
##############Show Dataframe########################
def show_dataframe(parse_data):
data = parse_data.getroot()
df_cols = ["DOCNO", "SONG", "ARTIST", "LYRICS"]
rows = []
for node in data:
s_docno = node.find("DOCNO").text if node is not None else None
s_song = node.find("SONG").text if node is not None else None
s_artist = node.find("ARTIST").text if node is not None else None
s_lyrics = node.find("LYRICS").text if node is not None else None
rows.append({"DOCNO": s_docno, "SONG": s_song, "ARTIST": s_artist, "LYRICS": s_lyrics})
DataFrame = pd.DataFrame(rows, columns = df_cols)
dictionary = DataFrame.set_index('DOCNO').T.to_dict('list')
##############Build Context Dictionary###################
nilai = list(dictionary.values())
nomornya = list(dictionary.keys())
for i in range(0, len(nomornya)):
nomornya[i] = int(nomornya[i])
lagunya = [sublist[0] for sublist in nilai]
artisnya = [sublist[1] for sublist in nilai]
liriknya = [sublist[2] for sublist in nilai]
context = {"DOCNO": nomornya, "SONG": lagunya, "ARTIST": artisnya, "LYRICS": liriknya}
return context
##############N_DOC########################
def data_var(tree):
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_lyrics = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_lyrics.append(node.text)
N_DOC = len(all_lyrics)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_lyrics[i])
return all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc
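##############Illustrative: Expected Dataset Layout########################
# A minimal sketch, not part of the original commit, of the record layout
# that data_var() and load_data() in this module appear to assume. Only the
# DOCNO, SONG, ARTIST and LYRICS tags come from the code in this file; the
# <docs>/<doc> wrapper names below are placeholders.
def _example_dataset_layout():
    sample = """
    <docs>
      <doc>
        <DOCNO>1</DOCNO>
        <SONG>Example Title</SONG>
        <ARTIST>Example Artist</ARTIST>
        <LYRICS>some example lyric text</LYRICS>
      </doc>
    </docs>
    """
    root = et.fromstring(sample)
    # mirrors the iteration pattern used by data_var()
    return [n.text for n in root.iter("DOCNO")], [n.text for n in root.iter("LYRICS")]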
##############Remove Punctuation###################
def remove_punc_tokenize(sentence):
tokens = []
for punctuation in string.punctuation:
sentence = sentence.replace(punctuation," ")
@@ -30,23 +97,79 @@ def to_lower(tokens):
tokens = [x.lower() for x in tokens]
return tokens
def generate_ngrams(data, n):
ngram=[]
result = []
##############Load Data########################
def load_data(dcmnt_xml):
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_profile = dcmnt_xml.getElementsByTagName('SONG')
all_date = dcmnt_xml.getElementsByTagName('ARTIST')
all_text = dcmnt_xml.getElementsByTagName('LYRICS')
all_pub = dcmnt_xml.getElementsByTagName('PUB')
all_page = dcmnt_xml.getElementsByTagName('PAGE')
N_DOC = len(all_doc_no)
all_sentence_doc_sample = []
for i in range(N_DOC):
sentence_doc_sample = ' '+ all_text[i].firstChild.data
all_sentence_doc_sample.append(sentence_doc_sample)
return all_doc_no, N_DOC, all_sentence_doc_sample
##############Indexing########################
def indexing(N_DOC, tokens_doc, all_doc_no):
all_tokens = []
for i in range(N_DOC):
for w in tokens_doc[i]:
all_tokens.append(w)
new_sentence = ' '.join([w for w in all_tokens])
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
all_tokens = set(all_tokens)
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
indexnya = json.loads(json.dumps(proximity_index))
words = indexnya.keys()
freq = indexnya.values()
freq = list(freq)
hasil = {}
for key in words:
for value in freq:
hasil[key] = value
freq.remove(value)
break
# display the n-gram results for each document
for i in range(len(data)):
sequences = [data[i][j:] for j in range(n)]
temp = zip(*sequences)
lst = list(temp)
result.append([" ".join(lst) for lst in lst])
numb = []
idx = []
for i, j in hasil.items():
numb.append(i)
idx.append(j)
res = {}
for key in numb:
for value in idx:
res[key] = value
idx.remove(value)
break
return res
# combine the n-grams of all documents into a single array
for i in range(len(result)):
for j in range(len(result[i])):
ngram.append(result[i][j])
return ngram, result
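##############Illustrative: Shape of the Proximity Index########################
# A minimal, self-contained sketch (not part of the original commit) of the
# positional index that indexing() above builds: each token maps to the
# documents containing it and to its 1-based positions in each document.
# The toy documents and doc numbers below are placeholders.
def _example_proximity_index():
    tokens_doc = [["love", "me", "love"], ["love", "you"]]
    doc_nos = ["1", "2"]
    index = {}
    for token in set(t for doc in tokens_doc for t in doc):
        postings = {}
        for doc_no, doc in zip(doc_nos, tokens_doc):
            positions = [pos + 1 for pos, t in enumerate(doc) if t == token]
            if positions:
                postings[doc_no] = positions
        index[token] = postings
    return index  # e.g. {'love': {'1': [1, 3], '2': [1]}, 'me': {'1': [2]}, 'you': {'2': [2]}}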
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
@@ -64,30 +187,19 @@ def stemming(tokens):
return tokens
def searching(dcmnt_xml, query):
def main(query):
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_song = dcmnt_xml.getElementsByTagName('SONG')
all_lyrics = dcmnt_xml.getElementsByTagName('LYRICS')
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
N_DOC = len(all_doc_no)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
sentence_doc = all_song[i].firstChild.data +' '+ all_lyrics[i].firstChild.data
all_sentence_doc.append(sentence_doc)
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
@@ -95,158 +207,118 @@ def main(query):
for i in range(N_DOC):
tokens_doc[i] = to_lower(tokens_doc[i])
stop_words = set(stopwords.words('english'))
stopping = []
for i in range(N_DOC):
temp = []
for j in tokens_doc[i]:
if j not in stop_words:
temp.append(j)
stopping.append(temp)
tokens_doc[i] = stop_word_token(tokens_doc[i])
for i in range(N_DOC):
tokens_doc[i] = ([w for w in stopping[i] if not any(j.isdigit() for j in w)])
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
factory = StemmerFactory()
stemmer = factory.create_stemmer()
for i in range(N_DOC):
tokens_doc[i] = stemming(tokens_doc[i])
stemming = []
all_tokens =[]
for i in range(N_DOC):
temp=[]
for j in tokens_doc[i]:
# print(j)
temp.append(stemmer.stem(j))
stemming.append(temp)
all_tokens.append(j)
all_tokens = []
for i in range(N_DOC):
for w in stemming[i]:
all_tokens.append(w)
new_sentences = ' '.join([w for w in all_tokens])
new_sentence = ' '.join([w for w in all_tokens])
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
for j in CountVectorizer().build_tokenizer()(new_sentences):
all_tokens.append(j)
all_tokens = set(all_tokens)
alls = []
for i in all_tokens:
alls.append(i)
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
queri=[]
spl = query.split()
for i in range(len(spl)):
if not spl[i].isdigit():
queri.append(spl[i])
import collections
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
punc = []
for i in range(len(queri)):
no_punc = ""
for j in range(len(queri[i])):
if queri[i][j] not in string.punctuation:
no_punc = no_punc + queri[i][j]
punc.append(no_punc)
kunci = []
nilai = []
# note: assumes the query is a single token that is present in the index;
# an unseen or multi-word query would raise KeyError here
for key, value in proximity_index[query].items():
kunci.append(key)
nilai.append(value)
lower=[]
for i in range(len(punc)):
lower.append(punc[i].lower())
dict = {}
for key in kunci:
for value in nilai:
dict[key] = value
nilai.remove(value)
break
stop = []
for i in range(len(lower)):
if lower[i] not in stop_words:
stop.append(lower[i])
xtree = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
xroot = xtree.getroot()
stem = []
for i in range(len(stop)):
stem.append(stemmer.stem(stop[i]))
df_cols = ["SONG"]
rows = []
join_word = ' '.join([w for w in stem])
for node in xroot:
lirik = node.find("SONG").text if node is not None else None
ngram, ngram_doc = generate_ngrams(stemming, len(stem))
rows.append({"SONG": lirik})
n_gram_index = {}
for ngram_token in ngram:
doc_no = []
for i in range(N_DOC):
if(ngram_token in ngram_doc[i]):
doc_no.append(all_doc_no[i])
n_gram_index[ngram_token] = doc_no
df = pd.DataFrame(rows, columns = df_cols)
df = []
nomor = []
for i in dict:
nomor.append(int(i))
for i in range(N_DOC):
count = 0
for j in range(len(ngram_doc[i])):
if join_word == ngram_doc[i][j]:
count+=1
df.append(count)
idf = []
for i in range(len(df)):
try:
idf.append(math.log10(N_DOC/df[i]))
except ZeroDivisionError:
idf.append(str(0))
#w(t, d)
#t = term
#d = document
wtd = []
l = []
for i in range(N_DOC):
dic = {}
tf = ngram_doc[i].count(join_word) # count the term frequency (tf) of the query n-gram
if tf != 0:
score = math.log10(tf) #log10(tf(t,d))
score+=1 # 1 + log(tf(t,d))
score*=idf[i] #tf * idf
judul = []
for i in nomor:
judul.append(df['SONG'][i-1])
idx = all_doc_no[i]
judul = all_song[i]
hasil = {}
for key in nomor:
for value in judul:
hasil[key] = value
judul.remove(value)
break
dic['docno'] = idx
dic['judul'] = judul
dic['score'] = score
numb = []
tit = []
l.append(dic)
wtd.append(l) # [i+1] = the document number; score = w(t, d)
# print(score)
for i, j in hasil.items():
numb.append(i)
tit.append(j)
hasil = []
hasil.append(sorted(wtd[0], key = lambda x : x['score'], reverse = True))
res = {}
for key in numb:
for value in tit:
res[key] = value
tit.remove(value)
break
return hasil
return res
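##############Illustrative: The tf-idf Weight Used Above########################
# The scoring loop above computes score = (1 + log10(tf)) * idf, where tf is
# the count of the query n-gram in the document and the idf list holds
# log10(N_DOC / df) with df collected per document further up. A minimal
# standalone sketch (not part of the original commit) of the standard
# log-weighted tf-idf this resembles, in which df is the number of documents
# containing the term:
def _example_tf_idf(tf, df, n_docs):
    if tf == 0 or df == 0:
        return 0.0
    return (1 + math.log10(tf)) * math.log10(float(n_docs) / df)
# e.g. _example_tf_idf(tf=2, df=3, n_docs=30) == (1 + log10(2)) * log10(10) ≈ 1.30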
def detail(nomor):
tree = ElementTree()
tree.parse("apps/data/dataset_STBI.xml")
def detail(id):
all_doc_no = []
all_song = []
all_text = []
import pandas as pd
import xml.etree.ElementTree as et
import numpy as np
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
xtree = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
xroot = xtree.getroot()
for node in tree.iter("SONG"):
# all_song.append(node.text.replace("\n"," "))
all_song.append(node.text)
head = all_song
df_cols = ["SONG", "LYRICS"]
rows = []
for node in xroot:
judul = node.find("SONG").text if node is not None else None
lirik = node.find("LYRICS").text if node is not None else None
rows.append({"SONG": judul,
"LYRICS":lirik})
df = pd.DataFrame(rows, columns = df_cols)
lyrics = df['LYRICS'][id-1]
judul = df['SONG'][id-1]
return lyrics ,judul
for node in tree.iter("LYRICS"):
# all_text.append(node.text.replace("\n"," "))
all_text.append(node.text)
N_DOC = len(all_text)
text = []
judul=[]
hasil = []
id = str(nomor)
for i in range(N_DOC):
check = all_doc_no[i]
if check == id:
text = all_text[i]
judul = all_song[i]
return text,judul
\ No newline at end of file
@@ -55,15 +55,13 @@ footer {
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: auto;
width: 100%;
}
.carda {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: max-content;
table{
table-layout: fixed;
border: 1px solid black;
width: 100px;
}
.jumbotron {
@@ -155,11 +153,6 @@ button:hover span:after {
right: 0;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
form button {
display: inline-block;
......
@import url('https://fonts.googleapis.com/css?family=Quicksand:400,700&display=swap');
body {
font-family: sans-serif;
}
h2, h3 {
color: #00a2c6
}
footer {
color: white;
background-color: #591a75
}
nav a {
font-size: 18px;
font-weight: 400;
text-decoration: none;
}
nav a:hover {
font-weight: bold;
}
.profile header {
text-align: center;
}
footer {
position: fixed;
left: 0;
bottom: 0;
width: 100%;
padding: 5px;
color: white;
background-color: #440f5c;
text-align: center;
font-weight: bold;
}
.featured-image {
width: 100%;
max-height: 300px;
object-fit: cover;
object-position: center;
}
.card {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
}
.jumbotron {
font-size: 20px;
padding: 60px;
text-align: center;
color: white;
background-image: url(https://ak.picdn.net/assets/cms/music_subscription_homepage_banner.jpg);
background-size: cover;
background-repeat: no-repeat;
text-shadow: black 0.3em 0.3em 0.3em;
}
nav {
background-color: #091729;
padding: 5px;
position: sticky;
top: 0;
}
nav a {
font-size: 18px;
font-weight: 400;
text-decoration: none;
color: white;
}
body {
font-family: 'Quicksand', sans-serif;
margin: 0;
padding: 0;
}
main {
padding: 15px;
overflow: auto;
}
#content {
width: 100%;
}
* {
box-sizing: border-box;
}
.button {
display: inline-block;
border-radius: 4px;
background-color: #7c1ca6;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 15px;
padding: 20px;
transition: all 0.5s;
cursor: pointer;
margin: 5px;
}
button span {
cursor: pointer;
display: inline-block;
position: relative;
transition: 0.5s;
}
button span:after {
content: '\00bb';
position: absolute;
opacity: 0;
top: 0;
right: -20px;
transition: 0.5s;
}
button:hover span {
padding-right: 25px;
}
button:hover span:after {
opacity: 1;
right: 0;
}
form button {
display: inline-block;
border-radius: 4px;
background-color: #7c1ca6;
border: none;
color: #FFFFFF;
text-align: center;
font-size: 15px;
padding: 10px;
transition: all 0.5s;
cursor: pointer;
margin: 5px;
width: 80px;
}
\ No newline at end of file
@@ -5,23 +5,6 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
</head>
<body>
@@ -38,69 +21,23 @@
</div>
<center><h1>Dataset</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
<div id = "leftbox">
<table>
<tr>
<th>DOCNO</th>
</tr>
{% for i in DOCNO %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table align="left">
<tr>
<th>SONG</th>
</tr>
{% for i in SONG %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>ARTIST</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middleboxb">
<table>
<tr>
<th>SONG</th>
<th>LYRICS</th>
</tr>
{% for i in LYRICS %}
{% for i in DOCNO %}
<tr>
<td>{{ i }}</td>
<td>{{ j }}</td>
<td>{{ k }}</td>
<td>{{ l }}</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article>
</div>
......
@@ -4,24 +4,7 @@
<head>
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
text-align: center;
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
text-align: center;
white-space: nowrap;
}
#middleboxb{
float:left;
text-align: left;
white-space: nowrap;
}
</style>
<link href="../../static/assets/css/trying.min.css" rel="stylesheet">
</head>
<body>
@@ -37,41 +20,20 @@
</div>
</div>
<center><h1>Proximity Index</h1><br></center>
<article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
<div id = "leftbox">
<table>
<center><p style="font-size:40px;"><strong>Indexing</strong></p>
<table width="100%"; border="1px solid black">
<tr>
<th>Token</th>
</tr>
{% for i in words %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middleboxb">
<table align="left">
<tr>
<th>Index</th>
</tr>
{% for i in freq %}
{% for key, values in res.items %}
<tr>
<td>{{ i }}</td>
<td>{{ key }}</td>
<td>{{ values }}</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article>
</div>
......
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
@@ -21,61 +21,27 @@
<!-- Custom styles for this template -->
<link href="../../static/assets/css/landing-page.min.css" rel="stylesheet">
</head>
<body>
<!-- Navigation -->
<nav class="navbar navbar-light bg-light static-top">
<div class="container">
<a class="navbar-brand" href="/">Cari Lagu</a>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a>
-->
<a class="navbar-brand" href="/">Search Simulator</a>
</div>
</nav>
<!-- Masthead -->
<!-- <header class="masthead text-white text-center">
<div class="overlay"></div>
<div class="container">
<div class="row">
<div class="col-xl-9 mx-auto">
<h1 class="mb-5">Silahkan masukkan lirik dari lagu yang ingin Anda temukan</h1>
</div>
<div class="col-md-10 col-lg-8 col-xl-7 mx-auto">
<form method="POST" action="/search">
<div class="form-row">
<div class="col-12 col-md-9 mb-2 mb-md-0">
<input type="text" class="form-control form-control-lg" name="querysearch" placeholder="Masukkan Query Anda...">
</div>
<div class="col-12 col-md-3">
<button type="submit" class="btn btn-block btn-lg btn-primary">Cari!</button>
</div>
</div>
</form>
</div>
</div>
</div>
</header> -->
<!-- Testimonials -->
<section class="testimonials text-center bg-light">
<div class="container">
<h2 class="mb-3">Lirik Lagu</h2>
<h4 class="mb-3">No.{{no}} - {{judul}} </h4>
<p>{{text}}</p>
<h4 class="mb-3">No. {{ no }} - {{ judul }} </h4>
<p>{{ lyrics }}</p>
</div>
</section>
<!-- Bootstrap core JavaScript -->
<script src="../../static/assets/vendor/jquery/jquery.min.js"></script>
<script src="../../static/assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
</body>
</html>
@@ -24,41 +24,26 @@
</head>
<body>
<!-- Navigation -->
<nav class="navbar navbar-light bg-light static-top">
<div class="container">
<a class="navbar-brand" href="/">CariLagu</a>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a>
-->
<a class="navbar-brand" href="/">Search Simulator</a>
</div>
</nav>
<!-- Testimonials -->
<section class="testimonials text-center bg-light">
<div class="container">
<h2 class="mb-5">Lagu yang sesuai dengan "{{ query }}"</h2>
{% if hasil %}
<h2 class="mb-5">Lagu yang sesuai dengan query "{{ query }}"</h2>
<div class="row">
{% for i in hasil %}
{% for j in i %}
{% for key, values in res.items %}
<div class="col-lg-4">
<div class="testimonial-item mx-auto mb-5 mb-lg-0">
<img class="img-fluid rounded-circle mb-3" src="../../static/img/hkbp.jpg" alt="">
<h5><a href="/lyric">Lagu No:{{ j.docno }}</a></h5>
<h5>"{{ j.judul }}"</h5>
<p class="font-weight-light mb-0">score :{{ j.score }}</p>
<h5><a href="/lyric/{{ key }}">Lagu No: {{ key }}</a></h5>
<h5>"{{ values }}"</h5>
</div>
</div>
{% endfor %}
{% endfor %}
</div>
{% else %}
<h2 class="mb-5">Lagu dengan lirik: "{{ query }}" tidak ditemukan</h2>
{% endif %}
</div>
</section>
......
from django.shortcuts import render
from django.http import HttpResponse
from InvertedIndexSimulator.inverted import main
from xml.etree.ElementTree import ElementTree
from sklearn.feature_extraction.text import CountVectorizer
from itertools import count
import pandas as pd
import xml.etree.ElementTree as et
import string
import re
import json
import xml.dom.minidom as minidom
import collections
try:
from future_builtins import zip
except ImportError: # not 2.6+ or is 3.x
@@ -17,138 +19,56 @@ except ImportError: # not 2.6+ or is 3.x
except ImportError:
pass
def home(request):
return render(request, 'apps/home.html')
def dataframe(request):
parse_data = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
data = parse_data.getroot()
df_cols = ["DOCNO", "SONG", "ARTIST", "LYRICS"]
rows = []
for node in data:
s_docno = node.find("DOCNO").text if node is not None else None
s_song = node.find("SONG").text if node is not None else None
s_artist = node.find("ARTIST").text if node is not None else None
s_lyrics = node.find("LYRICS").text if node is not None else None
rows.append({"DOCNO": s_docno, "SONG": s_song, "ARTIST": s_artist, "LYRICS": s_lyrics})
DataFrame = pd.DataFrame(rows, columns = df_cols)
dictionary = DataFrame.set_index('DOCNO').T.to_dict('list')
nilai = list(dictionary.values())
nomornya = list(dictionary.keys())
lagunya = [sublist[0] for sublist in nilai]
artisnya = [sublist[1] for sublist in nilai]
liriknya = [sublist[2] for sublist in nilai]
context = {"DOCNO": nomornya, "SONG": lagunya, "ARTIST": artisnya, "LYRICS": liriknya}
context = main.show_dataframe(parse_data)
return render(request, 'apps/dataframe.html', context)
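# Illustrative sketch, not part of the original commit: the updated
# dataframe.html iterates over DOCNO but also renders {{ j }}, {{ k }} and
# {{ l }}, which show_dataframe() never places in the context. One common
# way to feed such a table is to zip the parallel lists in a view and let
# the template loop over whole rows; the view name below is hypothetical.
def dataframe_rows(request):
    parse_data = et.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
    c = main.show_dataframe(parse_data)
    rows = list(zip(c["DOCNO"], c["SONG"], c["ARTIST"], c["LYRICS"]))
    # template side (sketch): {% for no, song, artist, lyrics in rows %} ... {% endfor %}
    return render(request, 'apps/dataframe.html', {"rows": rows})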
def preprocessing(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
context = {"tokens_doc": tokens_doc}
context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing.html', context)
def preprocessing2(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
for i in range(N_DOC):
tokens_doc[i] = main.to_lower(tokens_doc[i])
context = {"tokens_doc": tokens_doc}
context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing2.html', context)
def preprocessing3(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
@@ -161,37 +81,18 @@ def preprocessing3(request):
for i in range(N_DOC):
tokens_doc[i] = ([w for w in tokens_doc[i] if not any(j.isdigit() for j in w)])
context = {"tokens_doc": tokens_doc}
context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing3.html', context)
def preprocessing4(request):
from xml.etree.ElementTree import ElementTree
tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = []
all_song = []
all_text = []
for node in tree.iter("DOCNO"):
all_doc_no.append(node.text)
for node in tree.iter("SONG"):
all_song.append(node.text)
for node in tree.iter("LYRICS"):
all_text.append(node.text)
N_DOC = len(all_text)
all_sentence_doc = []
for i in range(N_DOC):
all_sentence_doc.append(all_song[i] + all_text[i])
all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc = main.data_var(tree)
tokens_doc = []
for i in range(N_DOC):
tokens_doc.append(main.remove_punc_tokenize(all_sentence_doc[i]))
@@ -207,33 +108,17 @@ def preprocessing4(request):
for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i])
context = {"tokens_doc": tokens_doc}
context = {
"tokens_doc": tokens_doc
}
return render(request, 'apps/preprocessing4.html', context)
def indexing(request):
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
import xml.dom.minidom as minidom
dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
all_profile = dcmnt_xml.getElementsByTagName('SONG')
all_date = dcmnt_xml.getElementsByTagName('ARTIST')
all_text = dcmnt_xml.getElementsByTagName('LYRICS')
all_pub = dcmnt_xml.getElementsByTagName('PUB')
all_page = dcmnt_xml.getElementsByTagName('PAGE')
N_DOC = len(all_doc_no)
all_sentence_doc_sample = []
for i in range(N_DOC):
sentence_doc_sample = ' '+ all_text[i].firstChild.data
all_sentence_doc_sample.append(sentence_doc_sample)
all_doc_no, N_DOC, all_sentence_doc_sample = main.load_data(dcmnt_xml)
tokens_doc = []
@@ -249,65 +134,40 @@ def indexing(request):
for i in range(N_DOC):
tokens_doc[i] = main.stemming(tokens_doc[i])
all_tokens = []
for i in range(N_DOC):
for w in tokens_doc[i]:
all_tokens.append(w)
new_sentence = ' '.join([w for w in all_tokens])
for w in CountVectorizer().build_tokenizer()(new_sentence):
all_tokens.append(w)
all_tokens = set(all_tokens)
proximity_index = {}
for token in all_tokens:
dict_doc_position = {}
for n in range(N_DOC):
if(token in tokens_doc[n]):
dict_doc_position[all_doc_no[n].firstChild.data] = [i+1 for i, j in zip(count(), tokens_doc[n]) if j == token]
proximity_index[token] = dict_doc_position
proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
import json
indexnya = json.loads(json.dumps(proximity_index))
res = main.indexing(N_DOC, tokens_doc, all_doc_no)
words = indexnya.keys()
freq = indexnya.values()
context = {"words": words, "freq": freq}
context = {
"res": res,
}
return render(request, 'apps/indexing.html', context)
def index(request):
return render(request, 'apps/index.html')
def lyric(request,id):
text, judul = main.detail(id)
content={
'no': id,
'judul':judul,
'text':text
}
return render(request, 'apps/lyric.html', content)
def result(request):
#%%
# proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
# for key, value in proximity_index.items():
# # print (key, value)
dcmnt_xml = minidom.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
if request.method == 'POST':
query = request.POST['querysearch']
hasil= main.main(query)
res = main.searching(dcmnt_xml, query)
content={
'hasil':hasil,
content = {
'res':res,
'query':query
}
return render(request, 'apps/result.html', content)
def lyric(request,id):
lyrics, judul = main.detail(id)
content = {
'no': id,
'judul':judul,
'lyrics':lyrics,
}
return render(request, 'apps/lyric.html', content)
\ No newline at end of file
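# Illustrative sketch, not part of the original commit: result.html above
# links each hit to /lyric/{{ key }} and views.py defines lyric(request, id),
# so a URL configuration along these lines is implied. The module layout and
# route names here are assumptions (Django 2+ style path() routing).
from django.urls import path
from . import views

urlpatterns = [
    path('', views.index, name='index'),
    path('dataframe', views.dataframe, name='dataframe'),
    path('search', views.result, name='result'),
    path('lyric/<int:id>', views.lyric, name='lyric'),
]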