{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
    " - 12S16003 Maria H. Siallagan\n",
    " - 12S16026 Yolanda Nainggolan\n",
    " - 12S16036 Prima Hutapea\n",
    " - 12S16049 Rosa Delima Mendrofa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import xml.dom.minidom as minidom\n",
    "\n",
    "# Parse the XML collection; each document holds one Billboard song.\n",
    "# NOTE: dataset_STBI.xml must sit next to this notebook.\n",
    "dcmnt_xml = minidom.parse(\"dataset_STBI.xml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Element lists for every field of the collection.\n",
    "all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
    "all_song = dcmnt_xml.getElementsByTagName('SONG')\n",
    "all_artist = dcmnt_xml.getElementsByTagName('ARTIST')\n",
    "all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
    "all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
    "all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
    "\n",
    "# Number of documents in the collection; every loop below iterates over this.\n",
    "N_DOC = len(all_doc_no)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One raw lyrics string per document (leading space keeps tokenization uniform).\n",
    "all_sentence_doc = []\n",
    "for i in range(N_DOC):\n",
    "    all_sentence_doc.append(' ' + all_text[i].firstChild.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preprocessing "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokens_doc = []  # tokens_doc[i] will hold the token list of document i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def remove_punc_tokenize(sentence):\n",
    "    \"\"\"Strip punctuation and leading URLs, then tokenize.\n",
    "\n",
    "    Uses scikit-learn's default tokenizer, which keeps word tokens of\n",
    "    two or more characters.\n",
    "    \"\"\"\n",
    "    for punctuation in string.punctuation:\n",
    "        sentence = sentence.replace(punctuation, \" \")\n",
    "    # Drop any URL standing at the start of a line.\n",
    "    sentence = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', sentence, flags=re.MULTILINE)\n",
    "    return list(CountVectorizer().build_tokenizer()(sentence))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(N_DOC):\n",
    "    tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "def stop_word_token(tokens):\n",
    "    \"\"\"Return tokens with English stop words removed.\"\"\"\n",
    "    return [w for w in tokens if w not in stop_words]\n",
    "\n",
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = stop_word_token(tokens_doc[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop tokens that contain any digit.\n",
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = [w for w in tokens_doc[i] if not any(c.isdigit() for c in w)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "def stemming(tokens):\n",
    "    \"\"\"Porter-stem every token in place and return the list.\"\"\"\n",
    "    for i in range(len(tokens)):\n",
    "        tokens[i] = stemmer.stem(tokens[i])\n",
    "    return tokens\n",
    "\n",
    "for i in range(N_DOC):\n",
    "    tokens_doc[i] = stemming(tokens_doc[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary: the set of all distinct tokens across the whole collection.\n",
    "all_tokens = set()\n",
    "for i in range(N_DOC):\n",
    "    all_tokens.update(tokens_doc[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional (proximity) inverted index:\n",
    "#   token -> {DOCNO -> [1-based positions of token in that document]}\n",
    "proximity_index = {}\n",
    "for token in all_tokens:\n",
    "    dict_doc_position = {}\n",
    "    for n in range(N_DOC):\n",
    "        if token in tokens_doc[n]:\n",
    "            dict_doc_position[all_doc_no[n].firstChild.data] = [\n",
    "                pos for pos, w in enumerate(tokens_doc[n], start=1) if w == token\n",
    "            ]\n",
    "    proximity_index[token] = dict_doc_position"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "# Sort the index alphabetically by token for display.\n",
    "proximity_index = collections.OrderedDict(sorted(proximity_index.items()))\n",
    "for key, value in proximity_index.items():\n",
    "    print(key, value)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}