Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
SearchEngine
Project
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Rosa Delima Mendrofa
SearchEngine
Commits
8f565798
Commit
8f565798
authored
May 28, 2020
by
Yolanda Nainggolan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added searching, fixed result and lyrics
parent
2d25e3c9
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
466 additions
and
532 deletions
+466
-532
views.cpython-37.pyc
...e/InvertedIndexSimulator/__pycache__/views.cpython-37.pyc
+0
-0
main.cpython-37.pyc
...edIndexSimulator/inverted/__pycache__/main.cpython-37.pyc
+0
-0
main.py
SearchEngine/InvertedIndexSimulator/inverted/main.py
+233
-162
dataframe.min.css
...nvertedIndexSimulator/static/assets/css/dataframe.min.css
+5
-12
trying.min.css
...e/InvertedIndexSimulator/static/assets/css/trying.min.css
+159
-0
dataframe.html
...gine/InvertedIndexSimulator/templates/apps/dataframe.html
+5
-68
indexing.html
...ngine/InvertedIndexSimulator/templates/apps/indexing.html
+6
-44
lyric.html
...chEngine/InvertedIndexSimulator/templates/apps/lyric.html
+7
-41
result.html
...hEngine/InvertedIndexSimulator/templates/apps/result.html
+6
-21
views.py
SearchEngine/InvertedIndexSimulator/views.py
+45
-184
urls.cpython-37.pyc
SearchEngine/SearchEngine/__pycache__/urls.cpython-37.pyc
+0
-0
No files found.
SearchEngine/InvertedIndexSimulator/__pycache__/views.cpython-37.pyc
View file @
8f565798
No preview for this file type
SearchEngine/InvertedIndexSimulator/inverted/__pycache__/main.cpython-37.pyc
View file @
8f565798
No preview for this file type
SearchEngine/InvertedIndexSimulator/inverted/main.py
View file @
8f565798
...
...
@@ -2,19 +2,86 @@ resource_package = __name__
import
string
import
re
import
collections
import
math
import
pandas
as
pd
import
json
import
xml.dom.minidom
as
minidom
import
xml.etree.ElementTree
as
et
from
xml.etree.ElementTree
import
ElementTree
from
sklearn.feature_extraction.text
import
CountVectorizer
from
nltk.corpus
import
stopwords
from
nltk.tokenize
import
sent_tokenize
,
word_tokenize
from
Sastrawi.Stemmer.StemmerFactory
import
StemmerFactory
from
itertools
import
count
import
collections
import
math
import
xml.etree.ElementTree
as
et
from
xml.etree.ElementTree
import
ElementTree
try
:
from
future_builtins
import
zip
except
ImportError
:
# not 2.6+ or is 3.x
try
:
from
itertools
import
izip
as
zip
# < 2.5 or 3.x
except
ImportError
:
pass
##############Show Dataframe########################
def show_dataframe(parse_data):
    """Flatten the parsed song dataset into parallel column lists.

    Each child of the XML root is read as one document with ``DOCNO``,
    ``SONG``, ``ARTIST`` and ``LYRICS`` children.

    Args:
        parse_data: Parsed XML tree exposing ``getroot()`` (for example the
            result of ``xml.etree.ElementTree.parse``).

    Returns:
        dict: ``{"DOCNO": [...], "SONG": [...], "ARTIST": [...],
        "LYRICS": [...]}`` where the four lists are aligned by position and
        every ``DOCNO`` has been converted to ``int``. A missing ``SONG``,
        ``ARTIST`` or ``LYRICS`` child yields ``None`` in that position.

    Raises:
        ValueError: If a document has no ``DOCNO`` text, or the text is not
            an integer.
    """

    def child_text(node, tag):
        # The None check must be on the result of find(): the loop variable
        # itself is never None, but a child element may be absent.
        child = node.find(tag)
        return child.text if child is not None else None

    # Keyed by the raw DOCNO string. A plain dict replaces the former
    # DataFrame -> set_index -> transpose -> to_dict('list') round trip,
    # which only regrouped the rows. A repeated DOCNO keeps its first
    # position and its last values.
    # NOTE(review): believed to match what the DataFrame version produced
    # for duplicate DOCNOs - confirm if duplicates can occur in the dataset.
    records = {}
    for node in parse_data.getroot():
        docno = child_text(node, "DOCNO")
        if docno is None:
            raise ValueError("document is missing its DOCNO")
        records[docno] = (
            child_text(node, "SONG"),
            child_text(node, "ARTIST"),
            child_text(node, "LYRICS"),
        )

    nomornya = [int(docno) for docno in records]
    nilai = list(records.values())

    context = {
        "DOCNO": nomornya,
        "SONG": [fields[0] for fields in nilai],
        "ARTIST": [fields[1] for fields in nilai],
        "LYRICS": [fields[2] for fields in nilai],
    }
    return context
##############N_DOC########################
def data_var(tree):
    """Load the song dataset into ``tree`` and return its columns.

    Args:
        tree: Object providing ``parse(path)`` and ``iter(tag)``, such as an
            ``xml.etree.ElementTree.ElementTree`` instance. It is (re)parsed
            in place from ``InvertedIndexSimulator/data/dataset_STBI.xml``.

    Returns:
        tuple: ``(all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc)``
        where the first three are lists of the text of every ``DOCNO``,
        ``SONG`` and ``LYRICS`` element, ``N_DOC`` is the number of
        ``LYRICS`` elements, and ``all_sentence_doc[i]`` is the title and the
        lyrics of document ``i`` joined by a single space.

    Raises:
        IndexError: If the file holds fewer ``SONG`` than ``LYRICS`` elements.
    """
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_lyrics = [node.text for node in tree.iter("LYRICS")]

    N_DOC = len(all_lyrics)

    # Join with a space so the last word of the title does not fuse with the
    # first word of the lyrics when the text is tokenized. An empty element
    # has text None, which is treated as an empty string here.
    all_sentence_doc = [
        (all_song[i] or "") + " " + (all_lyrics[i] or "")
        for i in range(N_DOC)
    ]

    return all_doc_no, all_song, all_lyrics, N_DOC, all_sentence_doc
##############Remove Punctuation###################
def
remove_punc_tokenize
(
sentence
):
tokens
=
[]
for
punctuation
in
string
.
punctuation
:
sentence
=
sentence
.
replace
(
punctuation
,
" "
)
...
...
@@ -30,23 +97,79 @@ def to_lower(tokens):
tokens
=
[
x
.
lower
()
for
x
in
tokens
]
return
tokens
def
generate_ngrams
(
data
,
n
):
ngram
=
[]
result
=
[]
##############Load Data########################
def load_data(dcmnt_xml):
    """Collect the document numbers and lyric texts from a DOM document.

    Args:
        dcmnt_xml: DOM document exposing ``getElementsByTagName`` (for
            example the result of ``xml.dom.minidom.parse``).

    Returns:
        tuple: ``(all_doc_no, N_DOC, all_sentence_doc_sample)`` where
        ``all_doc_no`` is the node list of ``DOCNO`` elements, ``N_DOC`` is
        its length, and ``all_sentence_doc_sample[i]`` is a single space
        followed by the text of the i-th ``LYRICS`` element.

    Raises:
        IndexError: If there are fewer ``LYRICS`` than ``DOCNO`` elements.
    """
    all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')
    all_text = dcmnt_xml.getElementsByTagName('LYRICS')

    N_DOC = len(all_doc_no)

    all_sentence_doc_sample = []
    for i in range(N_DOC):
        # An empty <LYRICS/> element has no child node; treat it as empty
        # text instead of failing on ``None.data``.
        first_child = all_text[i].firstChild
        lyric_text = first_child.data if first_child is not None else ''
        all_sentence_doc_sample.append(' ' + lyric_text)

    return all_doc_no, N_DOC, all_sentence_doc_sample
##############Indexing########################
def
indexing
(
N_DOC
,
tokens_doc
,
all_doc_no
):
all_tokens
=
[]
for
i
in
range
(
N_DOC
):
for
w
in
tokens_doc
[
i
]:
all_tokens
.
append
(
w
)
new_sentence
=
' '
.
join
([
w
for
w
in
all_tokens
])
for
w
in
CountVectorizer
()
.
build_tokenizer
()(
new_sentence
):
all_tokens
.
append
(
w
)
all_tokens
=
set
(
all_tokens
)
proximity_index
=
{}
for
token
in
all_tokens
:
dict_doc_position
=
{}
for
n
in
range
(
N_DOC
):
if
(
token
in
tokens_doc
[
n
]):
dict_doc_position
[
all_doc_no
[
n
]
.
firstChild
.
data
]
=
[
i
+
1
for
i
,
j
in
zip
(
count
(),
tokens_doc
[
n
])
if
j
==
token
]
proximity_index
[
token
]
=
dict_doc_position
proximity_index
=
collections
.
OrderedDict
(
sorted
(
proximity_index
.
items
()))
indexnya
=
json
.
loads
(
json
.
dumps
(
proximity_index
))
words
=
indexnya
.
keys
()
freq
=
indexnya
.
values
()
freq
=
list
(
freq
)
hasil
=
{}
for
key
in
words
:
for
value
in
freq
:
hasil
[
key
]
=
value
freq
.
remove
(
value
)
break
#menampilkan hasil n-gram per dokumen
for
i
in
range
(
len
(
data
)):
sequences
=
[
data
[
i
][
j
:]
for
j
in
range
(
n
)]
temp
=
zip
(
*
sequences
)
lst
=
list
(
temp
)
result
.
append
([
" "
.
join
(
lst
)
for
lst
in
lst
])
numb
=
[]
idx
=
[]
for
i
,
j
in
hasil
.
items
():
numb
.
append
(
i
)
idx
.
append
(
j
)
res
=
{}
for
key
in
numb
:
for
value
in
idx
:
res
[
key
]
=
value
idx
.
remove
(
value
)
break
return
res
#menggabungkan n-gram semua dokumen dalam bentuk array
for
i
in
range
(
len
(
result
)):
for
j
in
range
(
len
(
result
[
i
])):
ngram
.
append
(
result
[
i
][
j
])
return
ngram
,
result
from
nltk.corpus
import
stopwords
stop_words
=
set
(
stopwords
.
words
(
'english'
))
...
...
@@ -64,30 +187,19 @@ def stemming(tokens):
return
tokens
def
searching
(
dcmnt_xml
,
query
):
def
main
(
query
):
tree
=
ElementTree
()
tree
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
all_doc_no
=
dcmnt_xml
.
getElementsByTagName
(
'DOCNO'
)
all_song
=
dcmnt_xml
.
getElementsByTagName
(
'SONG'
)
all_lyrics
=
dcmnt_xml
.
getElementsByTagName
(
'LYRICS'
)
for
node
in
tree
.
iter
(
"SONG"
):
all_song
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"LYRICS"
):
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
N_DOC
=
len
(
all_doc_no
)
all_sentence_doc
=
[]
for
i
in
range
(
N_DOC
):
all_sentence_doc
.
append
(
all_song
[
i
]
+
all_text
[
i
])
sentence_doc
=
all_song
[
i
]
.
firstChild
.
data
+
' '
+
all_lyrics
[
i
]
.
firstChild
.
data
all_sentence_doc
.
append
(
sentence_doc
)
tokens_doc
=
[]
for
i
in
range
(
N_DOC
):
tokens_doc
.
append
(
remove_punc_tokenize
(
all_sentence_doc
[
i
]))
...
...
@@ -95,158 +207,118 @@ def main(query):
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
to_lower
(
tokens_doc
[
i
])
stop_words
=
set
(
stopwords
.
words
(
'english'
))
stopping
=
[]
for
i
in
range
(
N_DOC
):
temp
=
[]
for
j
in
tokens_doc
[
i
]:
if
j
not
in
stop_words
:
temp
.
append
(
j
)
stopping
.
append
(
temp
)
tokens_doc
[
i
]
=
stop_word_token
(
tokens_doc
[
i
])
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
([
w
for
w
in
stopping
[
i
]
if
not
any
(
j
.
isdigit
()
for
j
in
w
)])
tokens_doc
[
i
]
=
([
w
for
w
in
tokens_doc
[
i
]
if
not
any
(
j
.
isdigit
()
for
j
in
w
)])
f
actory
=
StemmerFactory
()
stemmer
=
factory
.
create_stemmer
(
)
f
or
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
stemming
(
tokens_doc
[
i
]
)
stemming
=
[]
all_tokens
=
[]
for
i
in
range
(
N_DOC
):
temp
=
[]
for
j
in
tokens_doc
[
i
]:
# print(j)
temp
.
append
(
stemmer
.
stem
(
j
))
stemming
.
append
(
temp
)
all_tokens
.
append
(
j
)
all_tokens
=
[]
for
i
in
range
(
N_DOC
):
for
w
in
stemming
[
i
]:
all_tokens
.
append
(
w
)
new_sentences
=
' '
.
join
([
w
for
w
in
all_tokens
])
new_sentence
=
' '
.
join
([
w
for
w
in
all_tokens
])
for
w
in
CountVectorizer
()
.
build_tokenizer
()(
new_sentence
):
all_tokens
.
append
(
w
)
for
j
in
CountVectorizer
()
.
build_tokenizer
()(
new_sentences
):
all_tokens
.
append
(
j
)
all_tokens
=
set
(
all_tokens
)
alls
=
[]
for
i
in
all_tokens
:
alls
.
append
(
i
)
proximity_index
=
{}
for
token
in
all_tokens
:
dict_doc_position
=
{}
for
n
in
range
(
N_DOC
):
if
(
token
in
tokens_doc
[
n
]):
dict_doc_position
[
all_doc_no
[
n
]
.
firstChild
.
data
]
=
[
i
+
1
for
i
,
j
in
zip
(
count
(),
tokens_doc
[
n
])
if
j
==
token
]
proximity_index
[
token
]
=
dict_doc_position
queri
=
[]
spl
=
query
.
split
()
for
i
in
range
(
len
(
spl
)):
if
not
spl
[
i
]
.
isdigit
():
queri
.
append
(
spl
[
i
])
import
collections
proximity_index
=
collections
.
OrderedDict
(
sorted
(
proximity_index
.
items
()))
punc
=
[]
for
i
in
range
(
len
(
queri
)):
no_punc
=
""
for
j
in
range
(
len
(
queri
[
i
])):
if
queri
[
i
][
j
]
not
in
string
.
punctuation
:
no_punc
=
no_punc
+
queri
[
i
][
j
]
punc
.
append
(
no_punc
)
kunci
=
[]
nilai
=
[]
for
key
,
value
in
proximity_index
[
query
]
.
items
():
kunci
.
append
(
key
)
nilai
.
append
(
value
)
lower
=
[]
for
i
in
range
(
len
(
punc
)):
lower
.
append
(
punc
[
i
]
.
lower
())
dict
=
{}
for
key
in
kunci
:
for
value
in
nilai
:
dict
[
key
]
=
value
nilai
.
remove
(
value
)
break
stop
=
[]
for
i
in
range
(
len
(
lower
)):
if
lower
[
i
]
not
in
stop_words
:
stop
.
append
(
lower
[
i
])
xtree
=
et
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
xroot
=
xtree
.
getroot
()
stem
=
[]
for
i
in
range
(
len
(
stop
)):
stem
.
append
(
stemmer
.
stem
(
stop
[
i
]))
df_cols
=
[
"SONG"
]
rows
=
[]
join_word
=
' '
.
join
([
w
for
w
in
stem
])
for
node
in
xroot
:
lirik
=
node
.
find
(
"SONG"
)
.
text
if
node
is
not
None
else
None
ngram
,
ngram_doc
=
generate_ngrams
(
stemming
,
len
(
stem
)
)
rows
.
append
({
"SONG"
:
lirik
}
)
n_gram_index
=
{}
for
ngram_token
in
ngram
:
doc_no
=
[]
for
i
in
range
(
N_DOC
):
if
(
ngram_token
in
ngram_doc
[
i
]):
doc_no
.
append
(
all_doc_no
[
i
])
n_gram_index
[
ngram_token
]
=
doc_no
df
=
pd
.
DataFrame
(
rows
,
columns
=
df_cols
)
df
=
[]
nomor
=
[]
for
i
in
dict
:
nomor
.
append
(
int
(
i
))
for
i
in
range
(
N_DOC
):
count
=
0
for
j
in
range
(
len
(
ngram_doc
[
i
])):
if
join_word
==
ngram_doc
[
i
][
j
]:
count
+=
1
df
.
append
(
count
)
idf
=
[]
for
i
in
range
(
len
(
df
)):
try
:
idf
.
append
(
math
.
log10
(
N_DOC
/
df
[
i
]))
except
ZeroDivisionError
:
idf
.
append
(
str
(
0
))
#w(t, d)
#t = term
#d = document
wtd
=
[]
l
=
[]
for
i
in
range
(
N_DOC
):
dic
=
{}
tf
=
ngram_doc
[
i
]
.
count
(
join_word
)
# menghitung nilai tf
if
tf
!=
0
:
score
=
math
.
log10
(
tf
)
#log10(tf(t,d))
score
+=
1
# 1 + log(tf(t,d))
score
*=
idf
[
i
]
#tf * idf
judul
=
[]
for
i
in
nomor
:
judul
.
append
(
df
[
'SONG'
][
i
-
1
])
idx
=
all_doc_no
[
i
]
judul
=
all_song
[
i
]
hasil
=
{}
for
key
in
nomor
:
for
value
in
judul
:
hasil
[
key
]
=
value
judul
.
remove
(
value
)
break
dic
[
'docno'
]
=
idx
dic
[
'judul'
]
=
judul
dic
[
'score'
]
=
score
numb
=
[]
tit
=
[]
l
.
append
(
dic
)
wtd
.
append
(
l
)
# [i+1] = defenisi nomor dokumen; score = wtd
# print(score
)
for
i
,
j
in
hasil
.
items
():
numb
.
append
(
i
)
tit
.
append
(
j
)
hasil
=
[]
hasil
.
append
(
sorted
(
wtd
[
0
],
key
=
lambda
x
:
x
[
'score'
],
reverse
=
True
))
res
=
{}
for
key
in
numb
:
for
value
in
tit
:
res
[
key
]
=
value
tit
.
remove
(
value
)
break
return
hasil
return
res
def
detail
(
nomor
):
tree
=
et
()
tree
.
parse
(
"apps/data/dataset_STBI.xml"
)
def
detail
(
id
):
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
import
pandas
as
pd
import
xml.etree.ElementTree
as
et
import
numpy
as
np
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
xtree
=
et
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
xroot
=
xtree
.
getroot
()
for
node
in
tree
.
iter
(
"SONG"
):
# all_song.append(node.text.replace("\n"," "))
all_song
.
append
(
node
.
text
)
head
=
all_song
df_cols
=
[
"SONG"
,
"LYRICS"
]
rows
=
[]
for
node
in
xroot
:
judul
=
node
.
find
(
"SONG"
)
.
text
if
node
is
not
None
else
None
lirik
=
node
.
find
(
"LYRICS"
)
.
text
if
node
is
not
None
else
None
rows
.
append
({
"SONG"
:
judul
,
"LYRICS"
:
lirik
})
df
=
pd
.
DataFrame
(
rows
,
columns
=
df_cols
)
lyrics
=
df
[
'LYRICS'
][
id
-
1
]
judul
=
df
[
'SONG'
][
id
-
1
]
return
lyrics
,
judul
for
node
in
tree
.
iter
(
"LYRICS"
):
# all_text.append(node.text.replace("\n"," "))
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
text
=
[]
judul
=
[]
hasil
=
[]
id
=
str
(
nomor
)
for
i
in
range
(
N_DOC
):
check
=
all_doc_no
[
i
]
if
check
==
id
:
text
=
all_text
[
i
]
judul
=
all_song
[
i
]
return
text
,
judul
\ No newline at end of file
SearchEngine/InvertedIndexSimulator/static/assets/css/dataframe.min.css
View file @
8f565798
...
...
@@ -55,15 +55,13 @@ footer {
border-radius
:
15px
;
padding
:
20px
;
margin-top
:
10px
;
width
:
auto
;
width
:
100%
;
}
.carda
{
box-shadow
:
0
4px
8px
0
rgba
(
0
,
0
,
0
,
0.2
);
border-radius
:
15px
;
padding
:
20px
;
margin-top
:
10px
;
width
:
max-content
;
table
{
table-layout
:
fixed
;
border
:
1px
solid
black
;
width
:
100px
;
}
.jumbotron
{
...
...
@@ -155,11 +153,6 @@ button:hover span:after {
right
:
0
;
}
table
,
th
,
td
{
border
:
1px
solid
black
;
border-collapse
:
collapse
;
}
form
button
{
display
:
inline-block
;
...
...
SearchEngine/InvertedIndexSimulator/static/assets/css/trying.min.css
0 → 100644
View file @
8f565798
@import
url('https://fonts.googleapis.com/css?family=Quicksand:400,700&display=swap')
;
body
{
font-family
:
sans-serif
;
}
h2
,
h3
{
color
:
#00a2c6
}
footer
{
color
:
white
;
background-color
:
#591a75
}
nav
a
{
font-size
:
18px
;
font-weight
:
400
;
text-decoration
:
none
;
}
nav
a
:hover
{
font-weight
:
bold
;
}
.profile
header
{
text-align
:
center
;
}
footer
{
position
:
fixed
;
left
:
0
;
bottom
:
0
;
width
:
100%
;
padding
:
5px
;
color
:
white
;
background-color
:
#440f5c
;
text-align
:
center
;
font-weight
:
bold
;
}
.featured-image
{
width
:
100%
;
max-height
:
300px
;
object-fit
:
cover
;
object-position
:
center
;
}
.card
{
box-shadow
:
0
4px
8px
0
rgba
(
0
,
0
,
0
,
0.2
);
border-radius
:
15px
;
padding
:
20px
;
margin-top
:
10px
;
}
.jumbotron
{
font-size
:
20px
;
padding
:
60px
;
text-align
:
center
;
color
:
white
;
background-image
:
url(https://ak.picdn.net/assets/cms/music_subscription_homepage_banner.jpg)
;
background-size
:
cover
;
background-repeat
:
no-repeat
;
text-shadow
:
black
0.3em
0.3em
0.3em
;
}
nav
{
background-color
:
#091729
;
padding
:
5px
;
position
:
sticky
;
top
:
0
;
}
nav
a
{
font-size
:
18px
;
font-weight
:
400
;
text-decoration
:
none
;
color
:
white
;
}
body
{
font-family
:
'Quicksand'
,
sans-serif
;
margin
:
0
;
padding
:
0
;
}
main
{
padding
:
15px
;
overflow
:
auto
;
}
#content
{
width
:
100%
;
}
*
{
box-sizing
:
border-box
;
}
.button
{
display
:
inline-block
;
border-radius
:
4px
;
background-color
:
#7c1ca6
;
border
:
none
;
color
:
#FFFFFF
;
text-align
:
center
;
font-size
:
15px
;
padding
:
20px
;
transition
:
all
0.5s
;
cursor
:
pointer
;
margin
:
5px
;
}
button
span
{
cursor
:
pointer
;
display
:
inline-block
;
position
:
relative
;
transition
:
0.5s
;
}
button
span
:after
{
content
:
'\00bb'
;
position
:
absolute
;
opacity
:
0
;
top
:
0
;
right
:
-20px
;
transition
:
0.5s
;
}
button
:hover
span
{
padding-right
:
25px
;
}
button
:hover
span
:after
{
opacity
:
1
;
right
:
0
;
}
form
button
{
display
:
inline-block
;
border-radius
:
4px
;
background-color
:
#7c1ca6
;
border
:
none
;
color
:
#FFFFFF
;
text-align
:
center
;
font-size
:
15px
;
padding
:
10px
;
transition
:
all
0.5s
;
cursor
:
pointer
;
margin
:
5px
;
width
:
80px
;
}
\ No newline at end of file
SearchEngine/InvertedIndexSimulator/templates/apps/dataframe.html
View file @
8f565798
...
...
@@ -5,23 +5,6 @@
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1"
>
<title>
Song Lyric Search Engine
</title>
<link
href=
"../../static/assets/css/dataframe.min.css"
rel=
"stylesheet"
>
<style>
#leftbox
{
text-align
:
center
;
float
:
left
;
white-space
:
nowrap
;
}
#middlebox
{
float
:
left
;
text-align
:
center
;
white-space
:
nowrap
;
}
#middleboxb
{
float
:
left
;
text-align
:
left
;
white-space
:
nowrap
;
}
</style>
</head>
<body>
...
...
@@ -38,69 +21,23 @@
</div>
<center><h1>
Dataset
</h1><br></center>
<article
class=
"carda"
style=
"overflow-x:scroll; overflow-y:scroll;"
>
<div
id =
"leftbox"
>
<table>
<tr>
<th>
DOCNO
</th>
</tr>
{% for i in DOCNO %}
<tr>
<td>
{{ i }}
</td>
</tr>
{% endfor %}
</table>
</div>
<div
id =
"middlebox"
>
<table
align=
"left"
>
<tr>
<th>
SONG
</th>
</tr>
{% for i in SONG %}
<tr>
<td>
{{ i }}
</td>
</tr>
{% endfor %}
</table>
</div>
<div
id =
"middlebox"
>
<table>
<tr>
<th>
ARTIST
</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>
{{ i }}
</td>
</tr>
{% endfor %}
</table>
</div>
<div
id =
"middleboxb"
>
<table>
<tr>
<th>
SONG
</th>
<th>
LYRICS
</th>
</tr>
{% for i in LYRICS
%}
{% for i in DOCNO
%}
<tr>
<td>
{{ i }}
</td>
<td>
{{ j }}
</td>
<td>
{{ k }}
</td>
<td>
{{ l }}
</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article>
</div>
...
...
SearchEngine/InvertedIndexSimulator/templates/apps/indexing.html
View file @
8f565798
...
...
@@ -4,24 +4,7 @@
<head>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1"
>
<title>
Song Lyric Search Engine
</title>
<link
href=
"../../static/assets/css/dataframe.min.css"
rel=
"stylesheet"
>
<style>
#leftbox
{
text-align
:
center
;
float
:
left
;
white-space
:
nowrap
;
}
#middlebox
{
float
:
left
;
text-align
:
center
;
white-space
:
nowrap
;
}
#middleboxb
{
float
:
left
;
text-align
:
left
;
white-space
:
nowrap
;
}
</style>
<link
href=
"../../static/assets/css/trying.min.css"
rel=
"stylesheet"
>
</head>
<body>
...
...
@@ -37,41 +20,20 @@
</div>
</div>
<center><h1>
Proximity Index
</h1><br></center>
<article
class=
"carda"
style=
"overflow-x:scroll; overflow-y:scroll;"
>
<div
id =
"leftbox"
>
<table>
<center><p
style=
"font-size:40px;"
><strong>
Indexing
</strong></p>
<table
width=
"100%"
;
border=
"1px solid black"
>
<tr>
<th>
Token
</th>
</tr>
{% for i in words %}
<tr>
<td>
{{ i }}
</td>
</tr>
{% endfor %}
</table>
</div>
<div
id =
"middleboxb"
>
<table
align=
"left"
>
<tr>
<th>
Index
</th>
</tr>
{% for i in freq
%}
{% for key, values in res.items
%}
<tr>
<td>
{{ i }}
</td>
<td>
{{ key }}
</td>
<td>
{{ values }}
</td>
</tr>
{% endfor %}
</table>
</div>
</article>
</article>
</div>
...
...
SearchEngine/InvertedIndexSimulator/templates/apps/lyric.html
View file @
8f565798
<!DOCTYPE html>
<html
lang=
"en"
>
<head>
<head>
<meta
charset=
"utf-8"
>
<meta
name=
"viewport"
content=
"width=device-width, initial-scale=1, shrink-to-fit=no"
>
...
...
@@ -21,61 +21,27 @@
<!-- Custom styles for this template -->
<link
href=
"../../static/assets/css/landing-page.min.css"
rel=
"stylesheet"
>
</head>
</head>
<body>
<body>
<!-- Navigation -->
<nav
class=
"navbar navbar-light bg-light static-top"
>
<div
class=
"container"
>
<a
class=
"navbar-brand"
href=
"/"
>
Cari Lagu
</a>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a>
-->
<a
class=
"navbar-brand"
href=
"/"
>
Search Simulator
</a>
</div>
</nav>
<!-- Masthead -->
<!-- <header class="masthead text-white text-center">
<div class="overlay"></div>
<div class="container">
<div class="row">
<div class="col-xl-9 mx-auto">
<h1 class="mb-5">Silahkan masukkan lirik dari lagu yang ingin Anda temukan</h1>
</div>
<div class="col-md-10 col-lg-8 col-xl-7 mx-auto">
<form method="POST" action="/search">
<div class="form-row">
<div class="col-12 col-md-9 mb-2 mb-md-0">
<input type="text" class="form-control form-control-lg" name="querysearch" placeholder="Masukkan Query Anda...">
</div>
<div class="col-12 col-md-3">
<button type="submit" class="btn btn-block btn-lg btn-primary">Cari!</button>
</div>
</div>
</form>
</div>
</div>
</div>
</header> -->
<!-- Testimonials -->
<section
class=
"testimonials text-center bg-light"
>
<div
class=
"container"
>
<h2
class=
"mb-3"
>
Lirik Lagu
</h2>
<h4
class=
"mb-3"
>
No.{{no}} - {{judul}}
</h4>
<p>
{{text}}
</p>
<h4
class=
"mb-3"
>
No. {{ no }} - {{ judul }}
</h4>
<p>
{{ lyrics }}
</p>
</div>
</section>
<!-- Bootstrap core JavaScript -->
<script
src=
"../../static/assets/vendor/jquery/jquery.min.js"
></script>
<script
src=
"../../static/assets/vendor/bootstrap/js/bootstrap.bundle.min.js"
></script>
</body>
</body>
</html>
SearchEngine/InvertedIndexSimulator/templates/apps/result.html
View file @
8f565798
...
...
@@ -24,41 +24,26 @@
</head>
<body>
<!-- Navigation -->
<nav
class=
"navbar navbar-light bg-light static-top"
>
<div
class=
"container"
>
<a
class=
"navbar-brand"
href=
"/"
>
CariLagu
</a>
<!-- <a class="btn btn-primary" href="#">Pilih Buku</a>
-->
<a
class=
"navbar-brand"
href=
"/"
>
Search Simulator
</a>
</div>
</nav>
<!-- Testimonials -->
<section
class=
"testimonials text-center bg-light"
>
<div
class=
"container"
>
<h2
class=
"mb-5"
>
Lagu yang sesuai dengan "{{ query }}"
</h2>
{% if hasil %}
<h2
class=
"mb-5"
>
Lagu yang sesuai dengan query "{{ query }}"
</h2>
<div
class=
"row"
>
{% for i in hasil %}
{% for j in i %}
{% for key, values in res.items %}
<div
class=
"col-lg-4"
>
<div
class=
"testimonial-item mx-auto mb-5 mb-lg-0"
>
<img
class=
"img-fluid rounded-circle mb-3"
src=
"../../static/img/hkbp.jpg"
alt=
""
>
<h5><a
href=
"/lyric"
>
Lagu No:{{ j.docno }}
</a></h5>
<h5>
"{{ j.judul }}"
</h5>
<p
class=
"font-weight-light mb-0"
>
score :{{ j.score }}
</p>
<h5><a
href=
"/lyric/{{ key }}"
>
Lagu No: {{ key }}
</a></h5>
<h5>
"{{ values }}"
</h5>
</div>
</div>
{% endfor %}
{% endfor %}
</div>
{% else %}
<h2
class=
"mb-5"
>
Lagu dengan lirik: "{{ query }}" tidak ditemukan
</h2>
{% endif %}
</div>
</section>
...
...
SearchEngine/InvertedIndexSimulator/views.py
View file @
8f565798
from
django.shortcuts
import
render
from
django.http
import
HttpResponse
from
InvertedIndexSimulator.inverted
import
main
from
xml.etree.ElementTree
import
ElementTree
from
sklearn.feature_extraction.text
import
CountVectorizer
from
itertools
import
count
import
pandas
as
pd
import
xml.etree.ElementTree
as
et
import
string
import
re
from
sklearn.feature_extraction.text
import
CountVectorizer
import
json
import
xml.dom.minidom
as
minidom
import
collections
from
itertools
import
count
try
:
from
future_builtins
import
zip
except
ImportError
:
# not 2.6+ or is 3.x
...
...
@@ -17,138 +19,56 @@ except ImportError: # not 2.6+ or is 3.x
except
ImportError
:
pass
def home(request):
    """Render the ``apps/home.html`` template for the given request."""
    template_name = 'apps/home.html'
    return render(request, template_name)
def
dataframe
(
request
):
parse_data
=
et
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
data
=
parse_data
.
getroot
()
df_cols
=
[
"DOCNO"
,
"SONG"
,
"ARTIST"
,
"LYRICS"
]
rows
=
[]
for
node
in
data
:
s_docno
=
node
.
find
(
"DOCNO"
)
.
text
if
node
is
not
None
else
None
s_song
=
node
.
find
(
"SONG"
)
.
text
if
node
is
not
None
else
None
s_artist
=
node
.
find
(
"ARTIST"
)
.
text
if
node
is
not
None
else
None
s_lyrics
=
node
.
find
(
"LYRICS"
)
.
text
if
node
is
not
None
else
None
rows
.
append
({
"DOCNO"
:
s_docno
,
"SONG"
:
s_song
,
"ARTIST"
:
s_artist
,
"LYRICS"
:
s_lyrics
})
DataFrame
=
pd
.
DataFrame
(
rows
,
columns
=
df_cols
)
dictionary
=
DataFrame
.
set_index
(
'DOCNO'
)
.
T
.
to_dict
(
'list'
)
nilai
=
list
(
dictionary
.
values
())
nomornya
=
list
(
dictionary
.
keys
())
lagunya
=
[
sublist
[
0
]
for
sublist
in
nilai
]
artisnya
=
[
sublist
[
1
]
for
sublist
in
nilai
]
liriknya
=
[
sublist
[
2
]
for
sublist
in
nilai
]
context
=
{
"DOCNO"
:
nomornya
,
"SONG"
:
lagunya
,
"ARTIST"
:
artisnya
,
"LYRICS"
:
liriknya
}
context
=
main
.
show_dataframe
(
parse_data
)
return
render
(
request
,
'apps/dataframe.html'
,
context
)
def
preprocessing
(
request
):
from
xml.etree.ElementTree
import
ElementTree
tree
=
ElementTree
()
tree
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
all_doc_no
,
all_song
,
all_lyrics
,
N_DOC
,
all_sentence_doc
=
main
.
data_var
(
tree
)
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"SONG"
):
all_song
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"LYRICS"
):
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
all_sentence_doc
=
[]
for
i
in
range
(
N_DOC
):
all_sentence_doc
.
append
(
all_song
[
i
]
+
all_text
[
i
])
tokens_doc
=
[]
for
i
in
range
(
N_DOC
):
tokens_doc
.
append
(
main
.
remove_punc_tokenize
(
all_sentence_doc
[
i
]))
context
=
{
"tokens_doc"
:
tokens_doc
}
context
=
{
"tokens_doc"
:
tokens_doc
}
return
render
(
request
,
'apps/preprocessing.html'
,
context
)
def
preprocessing2
(
request
):
from
xml.etree.ElementTree
import
ElementTree
tree
=
ElementTree
()
tree
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"SONG"
):
all_song
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"LYRICS"
):
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
all_sentence_doc
=
[]
for
i
in
range
(
N_DOC
):
all_sentence_doc
.
append
(
all_song
[
i
]
+
all_text
[
i
])
all_doc_no
,
all_song
,
all_lyrics
,
N_DOC
,
all_sentence_doc
=
main
.
data_var
(
tree
)
tokens_doc
=
[]
for
i
in
range
(
N_DOC
):
tokens_doc
.
append
(
main
.
remove_punc_tokenize
(
all_sentence_doc
[
i
]))
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
main
.
to_lower
(
tokens_doc
[
i
])
context
=
{
"tokens_doc"
:
tokens_doc
}
context
=
{
"tokens_doc"
:
tokens_doc
}
return
render
(
request
,
'apps/preprocessing2.html'
,
context
)
def
preprocessing3
(
request
):
from
xml.etree.ElementTree
import
ElementTree
tree
=
ElementTree
()
tree
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"SONG"
):
all_song
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"LYRICS"
):
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
all_sentence_doc
=
[]
for
i
in
range
(
N_DOC
):
all_sentence_doc
.
append
(
all_song
[
i
]
+
all_text
[
i
])
all_doc_no
,
all_song
,
all_lyrics
,
N_DOC
,
all_sentence_doc
=
main
.
data_var
(
tree
)
tokens_doc
=
[]
for
i
in
range
(
N_DOC
):
tokens_doc
.
append
(
main
.
remove_punc_tokenize
(
all_sentence_doc
[
i
]))
...
...
@@ -161,37 +81,18 @@ def preprocessing3(request):
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
([
w
for
w
in
tokens_doc
[
i
]
if
not
any
(
j
.
isdigit
()
for
j
in
w
)])
context
=
{
"tokens_doc"
:
tokens_doc
}
context
=
{
"tokens_doc"
:
tokens_doc
}
return
render
(
request
,
'apps/preprocessing3.html'
,
context
)
def
preprocessing4
(
request
):
from
xml.etree.ElementTree
import
ElementTree
tree
=
ElementTree
()
tree
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
[]
all_song
=
[]
all_text
=
[]
for
node
in
tree
.
iter
(
"DOCNO"
):
all_doc_no
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"SONG"
):
all_song
.
append
(
node
.
text
)
for
node
in
tree
.
iter
(
"LYRICS"
):
all_text
.
append
(
node
.
text
)
N_DOC
=
len
(
all_text
)
all_sentence_doc
=
[]
for
i
in
range
(
N_DOC
):
all_sentence_doc
.
append
(
all_song
[
i
]
+
all_text
[
i
])
all_doc_no
,
all_song
,
all_lyrics
,
N_DOC
,
all_sentence_doc
=
main
.
data_var
(
tree
)
tokens_doc
=
[]
for
i
in
range
(
N_DOC
):
tokens_doc
.
append
(
main
.
remove_punc_tokenize
(
all_sentence_doc
[
i
]))
...
...
@@ -207,33 +108,17 @@ def preprocessing4(request):
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
main
.
stemming
(
tokens_doc
[
i
])
context
=
{
"tokens_doc"
:
tokens_doc
}
context
=
{
"tokens_doc"
:
tokens_doc
}
return
render
(
request
,
'apps/preprocessing4.html'
,
context
)
def
indexing
(
request
):
import
string
import
re
from
sklearn.feature_extraction.text
import
CountVectorizer
import
xml.dom.minidom
as
minidom
dcmnt_xml
=
minidom
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
all_doc_no
=
dcmnt_xml
.
getElementsByTagName
(
'DOCNO'
)
all_profile
=
dcmnt_xml
.
getElementsByTagName
(
'SONG'
)
all_date
=
dcmnt_xml
.
getElementsByTagName
(
'ARTIST'
)
all_text
=
dcmnt_xml
.
getElementsByTagName
(
'LYRICS'
)
all_pub
=
dcmnt_xml
.
getElementsByTagName
(
'PUB'
)
all_page
=
dcmnt_xml
.
getElementsByTagName
(
'PAGE'
)
N_DOC
=
len
(
all_doc_no
)
all_sentence_doc_sample
=
[]
for
i
in
range
(
N_DOC
):
sentence_doc_sample
=
' '
+
all_text
[
i
]
.
firstChild
.
data
all_sentence_doc_sample
.
append
(
sentence_doc_sample
)
all_doc_no
,
N_DOC
,
all_sentence_doc_sample
=
main
.
load_data
(
dcmnt_xml
)
tokens_doc
=
[]
...
...
@@ -249,65 +134,40 @@ def indexing(request):
for
i
in
range
(
N_DOC
):
tokens_doc
[
i
]
=
main
.
stemming
(
tokens_doc
[
i
])
all_tokens
=
[]
for
i
in
range
(
N_DOC
):
for
w
in
tokens_doc
[
i
]:
all_tokens
.
append
(
w
)
new_sentence
=
' '
.
join
([
w
for
w
in
all_tokens
])
for
w
in
CountVectorizer
()
.
build_tokenizer
()(
new_sentence
):
all_tokens
.
append
(
w
)
all_tokens
=
set
(
all_tokens
)
proximity_index
=
{}
for
token
in
all_tokens
:
dict_doc_position
=
{}
for
n
in
range
(
N_DOC
):
if
(
token
in
tokens_doc
[
n
]):
dict_doc_position
[
all_doc_no
[
n
]
.
firstChild
.
data
]
=
[
i
+
1
for
i
,
j
in
zip
(
count
(),
tokens_doc
[
n
])
if
j
==
token
]
proximity_index
[
token
]
=
dict_doc_position
proximity_index
=
collections
.
OrderedDict
(
sorted
(
proximity_index
.
items
()))
import
json
indexnya
=
json
.
loads
(
json
.
dumps
(
proximity_index
))
res
=
main
.
indexing
(
N_DOC
,
tokens_doc
,
all_doc_no
)
words
=
indexnya
.
keys
()
freq
=
indexnya
.
values
()
context
=
{
"words"
:
words
,
"freq"
:
freq
}
context
=
{
"res"
:
res
,
}
return
render
(
request
,
'apps/indexing.html'
,
context
)
def index(request):
    """Render the ``apps/index.html`` template for the given request."""
    template_name = 'apps/index.html'
    return render(request, template_name)
def
lyric
(
request
,
id
):
text
,
judul
=
main
.
detail
(
id
)
content
=
{
'no'
:
id
,
'judul'
:
judul
,
'text'
:
text
}
return
render
(
request
,
'apps/lyric.html'
,
content
)
def
result
(
request
):
#%%
# proximity_index = collections.OrderedDict(sorted(proximity_index.items()))
# for key, value in proximity_index.items():
# # print (key, value)
dcmnt_xml
=
minidom
.
parse
(
"InvertedIndexSimulator/data/dataset_STBI.xml"
)
if
request
.
method
==
'POST'
:
query
=
request
.
POST
[
'querysearch'
]
hasil
=
main
.
main
(
query
)
res
=
main
.
searching
(
dcmnt_xml
,
query
)
content
=
{
'
hasil'
:
hasil
,
content
=
{
'
res'
:
res
,
'query'
:
query
}
return
render
(
request
,
'apps/result.html'
,
content
)
def lyric(request, id):
    """Render ``apps/lyric.html`` for the song selected by ``id``.

    ``id`` is handed to ``main.detail``, which returns the lyrics and the
    title; both are passed to the template together with ``id`` itself
    under the keys ``lyrics``, ``judul`` and ``no``.
    """
    song_detail = main.detail(id)
    lyrics, judul = song_detail
    return render(
        request,
        'apps/lyric.html',
        {'no': id, 'judul': judul, 'lyrics': lyrics},
    )
\ No newline at end of file
SearchEngine/SearchEngine/__pycache__/urls.cpython-37.pyc
View file @
8f565798
No preview for this file type
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment