From 7771dffa6ec46cf83ce1efd7686dedbdf8a79406 Mon Sep 17 00:00:00 2001 From: The Coding Sloth <143575542+The-CodingSloth@users.noreply.github.com> Date: Thu, 7 Nov 2024 17:30:24 -0500 Subject: [PATCH] first commit --- .gitignore | 164 ++++++++++++ LICENSE | 21 ++ README.md | 93 +++++++ client/images/google_camera.svg | 1 + client/images/google_mic.svg | 1 + client/images/google_search_icon.svg | 1 + client/images/sloth_search.png | Bin 0 -> 22557 bytes client/index.html | 77 ++++++ client/search.html | 119 +++++++++ client/styles.css | 247 ++++++++++++++++++ search/complete_examples/advanced_pagerank.py | 239 +++++++++++++++++ search/complete_examples/simple_pagerank.py | 110 ++++++++ search/crawling/advanced_crawler.py | 224 ++++++++++++++++ search/crawling/simple_crawler.py | 65 +++++ search/indexing/__init__.py | 0 search/indexing/advanced_indexing.py | 73 ++++++ search/indexing/simple_indexing.py | 34 +++ search/serving/pagerank.py | 34 +++ server/google_search_api.py | 136 ++++++++++ 19 files changed, 1639 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 client/images/google_camera.svg create mode 100644 client/images/google_mic.svg create mode 100644 client/images/google_search_icon.svg create mode 100644 client/images/sloth_search.png create mode 100644 client/index.html create mode 100644 client/search.html create mode 100644 client/styles.css create mode 100644 search/complete_examples/advanced_pagerank.py create mode 100644 search/complete_examples/simple_pagerank.py create mode 100644 search/crawling/advanced_crawler.py create mode 100644 search/crawling/simple_crawler.py create mode 100644 search/indexing/__init__.py create mode 100644 search/indexing/advanced_indexing.py create mode 100644 search/indexing/simple_indexing.py create mode 100644 search/serving/pagerank.py create mode 100644 server/google_search_api.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ba8d32 --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e7e63bb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 The Coding Sloth + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..496d943 --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +# Sloth Search - A Google-like Search Engine Clone + +Sloth Search is a project that aims to recreate Google, including crawling, indexing, and serving results through a user-friendly front-end interface. The project consists of three main components: the Client, Search, and Server. 
+[Check out the video for a full explanation here](https://youtu.be/WCpimlH0Kck?si=_zFzrb1cxZinWKo3)
+
+## Project Structure
+
+The project is divided into the following folders:
+
+- **Client**: Contains the front-end code, providing a user interface similar to Google Search, where users can enter queries and view search results.
+- **Search**: Contains the core components of Sloth Search, which replicate the three main parts of Google:
+  - **Crawling**: The web crawler that collects information from the web.
+  - **Indexing**: Processing and storing the content collected by the crawler for efficient searching.
+  - **Serving (PageRank)**: Serving search results ranked by relevance using the PageRank algorithm.
+- **Server**: Contains the search API used to handle client requests and return search results.
+
+## Installation and Setup
+
+1. **Clone the Repository**
+
+   ```sh
+   git clone <repository-url>
+   cd sloth-search
+   ```
+
+2. **Install Dependencies**
+
+   Install the necessary Python dependencies:
+
+   ```sh
+   pip install -r requirements.txt
+   ```
+
+3. **Client Setup**
+
+   - The client contains the HTML, CSS, and JavaScript code for the front-end.
+   - Open the `index.html` file in your browser, or use a static file server to serve the client code locally.
+   - You can also use the Live Server extension.
+
+4. **Search Setup**
+
+   - The `Search` directory contains the code for crawling, indexing, and serving.
+   - You can start the full pipeline by running:
+     ```sh
+     python search/complete_examples/advanced_pagerank.py
+     ```
+   - This will crawl, index, and prepare the content for searching.
+   - To run any of the other files, use the same pattern:
+     ```sh
+     python search/<path-to-file>
+     ```
+
+5. **Server Setup**
+
+   - The server uses Flask to provide an API for search queries.
+   - Start the Flask server by navigating to the `Server` directory and running:
+     ```sh
+     python google_search_api.py
+     ```
+   - Once the server is running, you can test it with the example query at the end of this README.
+
+## How It Works
+
+1. **Crawling**
+
+   - The crawler starts with a set of seed URLs and collects links and content from the web.
+   - It respects `robots.txt` to avoid being blocked and to ensure ethical crawling.
+   - Parsed data is stored in a format ready for indexing.
+
+2. **Indexing**
+
+   - The indexing module processes the crawled pages.
+   - The content is tokenized, cleaned, and stemmed, and stop words are removed using the NLTK library.
+   - The resulting indexed data is saved for use by the search API.
+
+3. **Serving and PageRank**
+
+   - The PageRank algorithm is used to rank pages based on their importance.
+   - When a user searches for a query through the client, the server uses the indexed data and PageRank scores to return the most relevant pages.
+
+## Important Notes
+
+- **Respecting Websites**: The crawler respects `robots.txt` rules. Please make sure not to overload any websites.
+- **PageRank Algorithm**: The PageRank implementation uses an iterative approach to rank pages based on the link structure between them.
+- **Data Storage**: The crawler and indexer use CSV files for data storage (`advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv`). Make sure these files are writable during execution.
+
+## Contributing
+
+Contributions are welcome! If you'd like to contribute to the development of Sloth Search, feel free to fork the repository, make changes, and submit a pull request.
+
+## License
+
+This project is open-source and available under the MIT License.
+
+If you have any questions or suggestions, feel free to contact me.
+
+Happy Searching with Sloth Search!
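+## Example Query
+
+As a quick sanity check, once the crawl has produced the CSV files and the Flask server is running on its default address (`http://127.0.0.1:5000`), you can query the `/search` endpoint directly. Below is a minimal sketch using `requests`; the query string and result count are just example values.
+
+```python
+import requests
+
+response = requests.get(
+    "http://127.0.0.1:5000/search",
+    params={"q": "google", "num_results": 5, "page": 1},
+)
+response.raise_for_status()
+
+# Results come back sorted by PageRank score, highest first.
+for result in response.json()["results"]:
+    print(f"{result['pagerank']:.6f}  {result['title']}  {result['url']}")
+```
+
+Each entry in `results` carries the `url`, `title`, `description`, and `pagerank` fields that the server reads from the CSV files.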
🦥🔍 diff --git a/client/images/google_camera.svg b/client/images/google_camera.svg new file mode 100644 index 0000000..c064e9e --- /dev/null +++ b/client/images/google_camera.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/google_mic.svg b/client/images/google_mic.svg new file mode 100644 index 0000000..1ca386e --- /dev/null +++ b/client/images/google_mic.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/google_search_icon.svg b/client/images/google_search_icon.svg new file mode 100644 index 0000000..6e03b6c --- /dev/null +++ b/client/images/google_search_icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/client/images/sloth_search.png b/client/images/sloth_search.png new file mode 100644 index 0000000000000000000000000000000000000000..68260d0bce845173b3411e5bd89a6ece9a66beb4 GIT binary patch literal 22557 zcmeEu1ydct5-x7R3Bldn-QC@SYjAfvxVr{-_uwwUA-Esh-R0o$xcAok6R%#?*6wUg z)l65=B7?O;%xGESJxX)L62oC1!-W;jg@$~?AQI!$}tDV9> z0RtljlMxqD_cS;MzpV0Z@K+cPwS!>p7?rq%1_oW0 zIROH^KVBG)1XTwP3POgs0Ct{)(+IK!98b^jN|W#6VruH`bL+g;HAw@c9y5M^hmB|D z;eMC{^yqn}1qWC`u|wd3#|oE81-QPToq}8c?;XstVL4iuEFWzK&J`Sr9q17#vm~|v zXZh8edH#Sq^WT;}7`OsR-zez;mE%8S1W@{P!~AbFEMehf09Lx0fOu?xxpXUD|rMM=5V$>76KEXO>~8v>4tzv~le>lRY)WD-ehz$gyLWP% z+!4a_{i8USk*2OnQqD+8a1~y`51|W?`PwS$%mETe0p&8)6V!vwg&{}gXiH>S!o8OZ zxC_aC?8Wg;lC9X!4FU=sm(h~M1W?5(IA96nA)bYCJ-p-)`nl&qADO)m&y*}q51MGY zodcM7?(jM~6oKdnFrlYH)MILe(mgMtb7|;(sbHdDZK4oZn9yDzF$WaREBTU2ux#|;#2tzWp-ZbsH$p(=_dAc+1E9Ut@pQyR6 z`JGhz$#jZuw{t}AuQtRQ;UemhEO?NBV?FmQ1PY<4LX>>h+Y#kTg5SdRTA3hB>{?zs z*zWjQ)*A@=gxBGM zdKbflBXsWLzV+#YYqLO*y|QxuNf#XxZZVek=S@f5ZF%PEhxSxF`$xVjj&wv_W zR`12laAhnp>u08~@^`Y`##~-8o4dEveN=1R5=2W-m7Ml};5}PJx)L>KhfG=zj4-C2 z8{x?HmXWxH(&K+&g=sMrOPu|7nekqgB`F^5w<##K((Ju>FzWZ9`55S-)o#)IkpWrN z>bS;?$Tt4n)J04B5J5=?1r9$$1j^2aL@TYlW{4$?#1PL`XEtI2{w^^Aw2Ux#_d&=oem+X87 zC3Lw5t8#H2C$ch8kU}K8%AzSQdMRS$b|GvU zUoxk1aL{+v*v2Ia;c}!XblQ^rAXW1RI&?_Jc2*He`AOtgL4e_rC>X_Sd) ziwE>imH7xBy#Tt%eDNGXfs=JI4@Qdg9`QU*AS}Ay=@=N8ElJ;ZuojU%vy>H?smWbm z2r9*KRH-<&VKb>Qi<<6`@yi~C-|rVH5p-jM@C)UJw0B!ov+eX`xq@`8_$`IpU0|8>GfQKp1|0 z(=$Imqxm2S2>^&WA7Tcra*95;`e!vYK?#fAx}H?XQFRVQc~!`ycLa&qH#&M% zR*FWB*W$I)pKud^3(82dJQb-12X%`&av!BwmyyVC9KhCOnyUq~Uh_2m+k$0uCZ>-h zNLb1?Iw*<|n!4{PMmDZDG~PBGY}B)4JR>^Bxqs~TLDp`l_Ux+La^&KO{bYmYBfrfk zdwe3iO#EWH<`Ayb6^Map0yl7j4FY35A^^Gog&905Wg!)WF+Yhy%S}vbAYRQP(T3uP zn9-&U*O)|>4Y`k`0Ga>Bw_eKog^HTSD|nA=&x^dU;i%~=PqHgtfy*Yd-e2+f*V}j+ zhTRWdNvotmwTZG)d4=Ipq9b$~RC%&5R#lFojPa-LkXp?VBB3%2-fx4pPA`8GXL(d^ zy+&JbyBSi?H_W6tkO{|VFuw$8vfPAFlr0t4DPrn~jf)H*^(h&yV3}Dun$@t&ciXUb z$o`O2YTGN$GZbRiDMxC$FnZym02bBWy41vBp*bSvt0Xk8w4F7q{eh z*(#41?oFx!X#0?gds;vl&MCK{2}amZtlFv*xtp_Hl7J{OPAEuG1~lzY`pEyZCD6`Ci#VOX?~`9#Zl`K7b-T#% zckAz!>cbPb6D)16jy29_;6k!6Mp@9RKq44E4^fECkHc?%hZA~&F65D|u3pe)L{c1z zdr5$YuoC_V>rp~C5W}UE+{i3Wp1+lqR|!QS&Vm+=cjb^ek)aa%-(+f%5Cob~5%?p$ z^!E-m7Ar@y-tuJ~c@UL>Kky5+e7t-3?oBLiybXrkQiLySWsf~CLqGBdnG=s`I@X}1 zreWZS#(18B6N}x(c)`K2Sqb(2UL}&>$fjPaQM`9dKexi>BA(hDhNy;J+8!?rUqIyv zl8#Ra@!GhlFybsFL8pe9847Ijni)#Umb%)=5ZyOEkX?7LagFM`GcucldFV4!Yq%E! 
z3bRZ0Q_#aelx!w5;;9VCcthJA$&I{lKtqq&nV+*n_q>#x7(b;f=j z5(fTf^PP1YZD}e|?fiN}IS-O=i}Pjb&YH9Y@@SC6b`XQfr;)0y)R@SX29<6PbFZx_ zM&BIvP}8Xyiuo*v-@m1tBM#)ZG}X-9Z{<;mR#Dl%RV*6dh-dG~ij$gw{@ zB|IPmrem4zZ=QhhMA*{9Mv%215j?ubUhb)#&dUe`vC zi0%C7@fYrueQ|?-U31K?at`jdANV$GmhFz-#~G|lis_}90q-{Ydv>PE; zeW>v6wlXl;?=>-yjd0IxJlLfopv6hD_F`#}VPX=ESmvGmyZjzpw)5R`ZPIy0!R?r* zXe3d_X*6hD0*m1g3JNR9+st7At%&Sx7^wX^Bv)Bt=Km@qB;@7UbWP+f;Clz>irZk= zD3|LpkHL3k*vi*mfhI!G#%fyvUysmYGI&7t`bX}Pn{02>wLx9*Hfchyspa>SIhu51?EVIN-b$vvp^*cM;RlLI1Y2ts;I#gqZ25H7mJ0CW zt+M#OiqOJa z6bMUypKG_|_Id}Zs$)>XPuxem+bSB6fJ`epe$)`8Rq9D&R1gh8i%^rp=VF1J5+W_q z9&Y=Q)J>h+YD7bm!CJ3Pd>oD|(U*LFM{T;0e2zD&yI)4~r=3nEmW;Lv-+K@nhRYqb zq4y%>EFK%}>NC>-`3t={Vdq!A1+qPD^x(g3U{18Mu3a8gbxG@Y(J~>9$X;eRJaSS6 zx;RP?0wq8jpFgwPR?v@MPMXX0PE)505rV8nB95ZtX>|`GMIxC62pEUDp893_W?=^h zIjD}BIDlo|dvHyzMo@M)UC<86Ni^XtFoYekt6uWfN6w&gEorOG&+E&|gDgZ^xVyJhy!N1V_Mfg$ z-%{X~o5Mjcep_A}nI()?#~P1J4O{-^PA0NRY?uL|9E$3-zU02_hqBC05Qqrtk0qtZ z9wbb`eQn^S*lT&3F_#z_oE_=amQ$)$VP^3_ri!{osRsXy6nUOfo~xy4_P9P8E`J6O zUa#E1A7g~`O3PxP+`uKWSIa>#*Y0y=GygdxvT&)_7FqE&12qP*e6@u4qt1i2P!xOZ zj)C#Q5L}3EN9pa>^#}-wXBk-<1Xg|T)foHsMyjN_-+$Y$=6eB~t8e`cW=xDosrvKN zX-tT;G;Hd`NUgSG%A-t|#y>G&>*r?QV}wwqOt_HorDy2Mfm(D4@AJiGf2^F8k3hC~A4r{Abf3G#ECBFuo0$>eezH0n1;-8>A$7 zI;1X(8l@_>n8ENFNz4n2Q$kJ#$b`C|@2hoJO?dy%E`_3`L~7_(P{51yk9;5Lcc)NV zt&KeR7XHA2Zu%{jW56e?jF3ObcUI?JI{4fMTBF2sk6&K4G#jY%K6|l1Jj4HGrtr$d z65ng6^aLEue7AW89{;zPhT1ejJ!e3>Jz7snRN2tV;FIV6Wt)5n6r6QbQZxv0?nn&e z@e8*bdDU6X`J237wb{O z5T(m)-H!iKO#DEP(iRqBV67;qm+#V|8}=8{>JLI3fQ2r}49)0=q1>-qB_?|dcrM6A zF6`{$(?s@y4z2XVA^r4uq43FEh4mH)oVE9x_may@f)M0$ocgc_O2x66&){OW1IhiBB=s`yJmq9jTAvC?~M1TwHt7BR-|FvwxgBW6g@y^w^mXRBG zgoe+N@P@a*v{nmpC`vS2UV)%A7x4;jVm{hIcnoLO=}aK2sijZ$dV*PA=8fvXTg4J^ zy7vv8)$cUt!pWAPTPgSK7gy>VV;7eqtaJBZLr4=WeYQ&=Polgl?~AWn*oOx~gZ~G{ z^DNOvqn&L@7LGIs={F9d9#C&9HTJIf3H0 z$d6D-B~SM$0QNezWYH=N^%iW1s3$#e;lH|e0CS`BN`T{ z5OH+GVSfL9GswdH!TPw0nYemFYt1naEtUaUuQxmIO?2!yj2NwMDl(t#;>iA0RB7rKn8t zAVz_Ld-J?KR`G)gZS!o`ejA&gM`~!^tkaH+u_EG@pjM2=4bz5*C81^JU>hh0JQuGn zEG-c(`Pn>>Ttr&dV6~d_<=H29*ZO<@JM32EEKA801iH{@8zd=(a-l%;%E5wLt`Z?9gSHG0de>Rm>>%lizBLy^r_W_}| zbt;4E(nMFehA?RRMoUpI;K?-+$xR~|*rZ&@e9og$nciq3kqq7yVu}hpa($!orUmhV zYqfM0I4nO4#}C=0@Rk%B8yg=-JVnVx@03D*2LCSaV@^e!Np+Q~{lF$K&42&{*PMOB|0gy;pZ*bxMMnp-o#e`Rj8Uk=(Enz|Vh&gifek^W5u`vfT$QVh#l? 
z`yyK!c3#dXyD<%dmH07J(b&~lnK8*LV=(BKT|hC5qRCLqJ|UFk$Am_L*Y>0IVFJpU ziF|ai*&D_yQ_@UuDC&1OhBBy+osR3m;&ZuWn${S5;6X{J@seqsoqx;v!YtB}B@7zHoT$>$ zG)v}EJF!*+lQwW9BC|P^#&9B_6-fmEtgFxPgk4boeo(#RQQ|U3HjeE0=w}N+uf=j; zcjdE%vALDfT}39q7y(0u9EA27e>V-Tt~zjmn&jDgejOG zZH`N5UUFqfBoxM51WD&=y=-)!t z!r)O@J|0=%nkN}$+MLh)OJv#>UWzEa{yqlBR@4SvWF@|e<4W%9=v&OkOnHG>r9-Bm zNZzuY6oj@bF!(XPX)ww4S)+XEy+{uFkzfzOuS;l-PoXYzf?mRVQneS!2jo^O}T+S86 z&pOsCI>mL)L*9EMU8>fAZLTYXv`op6qd)ZCmc26fO)6U-)<4~E9BbC3xkla6pmZv8iuFvC4Pr{_ULcUK{Py$qZ>FM ze0Sb19;y-GwyF^v-+0@XyZlWBtx#Jur%=ud@u5QS!0|Dw|4#p>v%Kr`X^YzA+&&z#7ET=+h{@(qA|A~xFhVSkpu`<1l5VwvmvhCa$t1-VwYB0RVTIc!zg^K5W#CRzRxZY&n8Xu2y8BK@`_ zsQPlP6osODV9E-v0VPuvj}g2-9+*Yl_F(UU8+9r=2tfB|AvhKj^iHdNdnZC?N zWfZaX_*^^Qimsp(cdw_x0AaBgCcU*qWlPzsWHrUGNA;O5oH*ZTus}V(f*b#dPveLp zR!-1YG7_P)Ka6RQ*|@1Yy?Wj(J_G&~SEaT1vkHqQlv*TlIl2^xI)aC&;GBGGT8`{P z@OMQY!{4$EzBk;e_5R%9sw#DiK#-3Pw9W<$w51BjEYd07?Nw2nn8?~*L8;pS%9f1E zODOy6k!c+-T;k3dkqL2ouoa~nw8{|qhl~i6n?&~UHlE^<`41Y15cw_{vxlbtEeT$4 zk0|9mFAY;HP_APP7B_VVF+er;dS@l2HT)REYaAq%v{qQsS9-cLI?v4xkby3tD2V_k z!PnqL=kmcaQjKSMJBL)l!1yLP!uw*?O|hPI=xsArwfX@p(cKUD^f6yPMe@NeVlTAz zS)wsQmqzMIDL)-r>ggIP`+?&9kd8iYaO@laSd)&R^!1`0n%88j7riOB}{tO%byB*Bxfl*P4Gi;aaU@yIy2;p7ljt% zP>eM)oHxSaPNGM40IYDXoDx^@556Ov!lH8-;y;t7k=-pyWJlQ2U@M-{+}+@;Se| zkq;BIEMH>KFnP%An8(wwA1V=;@ULt}2$2J|svK2G zEmbU-qK*s&gJSYlNk3i5y*(zs==lL6?432`$Xy&c%)h2l2sl-V=e>y~W-?^Je@k@Dmi{3H?XB9}>ARV0lnuX)N#!8WkkU(WX@ZUzwImKa^q1S;$s+?bbAB%@vgdfKzGCmR%2+F6ga;ZH7AhL~DbsniV z%kEg{S4kg<&H*pji5)8B%v2!e`wbrfaIetwg6(;xo9U9rRH}r8dT4bosX$1eLmM68 zI5kj9-{w>BjE4FnMpg{dOYR1xUZvC2#i~;@6Q~kk&<=C8h;*D_pdLeG=AX{F(AFe? z>i=zwFv{O~5tY@+BpN}f&Bj6T*3BbxVPh;XA1)R=o_dn&=z253SFg3kHI3sXhpy;P zpWrSf)2?X<&v9j%RUS_PotpqC?%}k^fjmw*soT)E?UJXYW{SZL%OMt^nMjA#Zyb!E z$3C+Pi$XE`XCEi&yt^?Y1;3j}f9KYgEE(xSK-m`97<*a08l3svm!eXo9V%nT?j;wO zL%9Vcf(dBz)rj?h1)tm zJ9JpU5slaZ4cPDbRCurq@cbYj+*oIjBV75vT61{AGJshI0%^GMQVcsK&R&l|znnAE0x_Am*|x*JQIbjg~1Y`d)@9)~r4f9O9z zgBI`WQp$NSk9>~YT55k&I}#hZJ-XV@)$cL zY!h1g@xz`Ym5R~fU*Mji7i{wQ|IG!Mr0{mEX^QH)thM+% zi&c&CF_BZ`P|*+;Fwx_4q*)6D#q%_|J0hQ&O;oU~;VbmI0#t(dz$k-7cgwON&4&62 zX9n`cd{hK`R-PWvOmV`Txumk~IPZV;c5-NgTBRluU{(V(gM;>zeNTc!_5)~avgX0t z&q6)UnQAj6(eqZ1?I45Llwq{B2=?9s%9Wpd@_KHURKDbJuHf9H4MOn?=jB>g4w@HB zPADtCO=($9A~vB$kv0-NB>Hw3*gJ-}_xt(FGL;c}72TRKmCL3hRXU}xDgI;J!lM!= zleN04aPp=e^Sk;hCSwo*yU0L7@v2^^b(wSwGlm$vS6bF(V%^jH{ZU1!j?vKoum@3T z0D*`6L61$9bLZ4PaE@m_Rd-Buk(#H-NASM+ySCk3;DdnA z3}i~Q>SL&<105$WOL^)!`DeGvqC;VEkBjs7f}^AB+Q1~~5Aj#{z!sWFJaA@6`T=4V zy@Ax<`AV={g%YijsPR6;nMdwq*w%XS>E0Iu>LK_hX+juonGjR=S8I=^ zzCDc!KHNr_B(eo=Yx<*Fd!tpJNL19PIE35>KgxsmmMNQ!q_{78kA~1G5HBU^HgSJe zmr|6h$4|dpdQ~`zs~jArz#8u;s7fdW`@}v4Ldb_=1H;;7CtcvD$4}-5z1Q;o6NM=& zm!on4!R3lELh!8~ee-wI_Qr#D6m%R+ncz$`+9YH~QRP~vCk_}oW+?1ML7z4iPxP2V zY&~UKB@AzLgvc!LbTobhWw14gctZGRZQ@}RBW5=^fZIO!Aw1;T6E;w^0c$ADz(5{3 zy@O>V^q-dDMZU=IC|03vcxo)ZUII+8y($M>js%}t)?*C`kq*mc33REy8&K$D5X34< zTXeMG*|G_Yl}3(25nu(B(t(3lFCabOeHAw1T4r_O?*sQtO_OCr;!2!#dCZ z_?A>q7-YM)JkJEefp^3mxsq=H^6x(7wJu^vea|C58zlca8Xz?uDP zGLfM?dLH;Mm4trKA}x|^A&ue{hcgLSGp1=LHnl8tBo;&J4H}lzE~I8(dG4nW&r1IS z80kZP%D0Nn{YBGXCW}J9oMBHlY0!o2(cKm#6|(91!&0_)Bl~~%$!GA<8UKD`tEzAJ zb`E7X1e7&JiKUQw71XXS19bxub5tw2>2$%iT1GA}Wv$V!isz3Hw9)y?SouZFCE@pv zm=%lr28~Z+WXVg@lsn3{2W8*yA;%9xBB5T6I_?*OJM>xnz}jmnJYag2#2U8BWKw3% zC_TlrGOHv*iP8%-!zVD1b4bw`S)UU}k2A&8=aVj*utwa;}c!Oqskrlqs^*|mBH&<{Ef zhJU(3_&7Qel+Ueq#*oDrjLS+#)~40(DT~({*1aEw8_n-2mS54^>o9-Y{(e)047mu7 zt$#cV%a)ScKzb5U-S0adkBf!LCM8*l`AA%o8+nsO>~l2Vh~NHx6(b$PcgIbOG>wDR{ zd>NYfEA$%j!{GZk0fx&bdkYKx=UGidw#AEt+zeG7Ex=iTsF;_c1Hv-|&76V<|EDdv 
zpNAx6Dfe)iXG2)Z{s2m#wzjPDn0cwbaDY1o;|V0xF7d&OG^MU!0m`|oRGTF{a9ISC zEFFhr9K~=yUEB4GD6z55gLJ?Vte$4dsx*HX*CKNDae3u4qzB&0)aio^CRkFeVJ%Ip zh-)M2z|@f_^zodnTH0>@$ZgMTVyG~!P}DL?fVhjQT1N?|8Jb)C{tTDkxzpto5X7RY z!CCP^Tzk^MX>e+l1gj4k-d&V=$Au`EX%M)QYIt%}o7SVzvnho^+u=1k(pMxIK&*>Q2&Ky^g=;58}`m#Q4 zG9}GGLk4YAf&<`J2y&GS+o9Uk-undOwXCW|rlU$vH88SiHuoqeqgz@?C{V^t-6RUb z2wUhq#cA1-IG~9F6%kDeK*@z@%^PQma;9)%HQ(St#Dn|Ns@Y%qt^k_4ZPahSkP$Is zP3Y)v(=1&N24M`7aE6G~5yeCzzgtzc)#%VMVav9BoVn0~s7K~Xkp?U1G#y>(!T+}Z z^KLwj$f~6RO!d^4*mU5eR*JKWYWF*z4@j}y4bL}#{Gfqc8%6aVL94^Opwu5#2 zc#}aa63~F*8D@gz{p;InafP;zQjQrSafo{JchHaI_pax6>9N0A1NUOTdtIcRPk0=Z zZJLQ;#1>&S+G{g^pH{^dq9dLTOV0hgL1B~L`Ncr!Z{2%v8p<=KfRcjio5h4gYs@%k zUc4@7CoI5X%;_o=mbsT_U!VJ!qTA!{zUq~Y^S#J!Kn8>cNn2^_vxIHFIpZ{HuoFw9 zr1Trsg1~WgtPGK-a){^fN*lBiSEkvWuEt3FMGrDpEp$qGv4IEPJd^Dlu^r~lFfA2n zsOp7Z{pE78NTYmJA2obm$o*WxnsU%|NHy2S3>+_34&(@P=+i*3thj%c?Ri8}A{j00 zz!uXdV8LCQ@gD2)y$HcTttL($sF-&~FH>agC{a@>?>T4`8Z)Tw{1yMcXvY(_qA;6B zSo{a9JP)CuVyGr)vyUTOr+zp3HEB0`ReY{N(~t$5lfP>n)hClZYUv(Zzy%62D9ixX zvl!dN^ooi6nQi`ygqHZr#2uIDHLlQz#u4Xybb}D5rm6jF0L_$i_X`p_5;+SC6HQ5G zC^)PLTLggfr08LQzq8sE;@#_Fr{>Xm$NtXT5T6kAJUh6}CT>{Zk8|%TzeVxya?Ehv zuW-~+4}pAfFTma?wwE02!yJNlLIseU`E~LG>s3)*j*XzfkzSWq9s9p3VRgz>*>JYX z5Jz>$6~;sK$7K1EN-tb()QU?hPhEYoI$jx6)Fy{Gn`uGr5y4C0%z1Rc?gzQgYtdfv zpQB#NL~Pa4^9^I9#ug}M{UR!h_i1iM)_X0c<^!o;3|TJVz~?+4Boh~!gu$#As1!3M zVd$`SQS9~@`jyjmKb+(jyMvUl>PW! zEkZAIP~VRwMCqZo++Xn&pFO7(jfnaVS#9bt^6Ybkg$c}#cz*s9NSFhSu;*#s zJqi!?HU3)*vJPjDt!E&iCs}vhF3#t&_$BUW%7{O17D69<#FqJ* zyw?%ClObe5mQ=!Zh*NefU0gGbaif68{J&=2E+VUjW=#LK&J*8`voFHlIkafV1w7

6Zi9wTXK2lH^_W$X{-VL;QKkG z&Wg&WHZW4o!8n30=HLWiI)B+@-C$Z8X3|5vFF~L)Dg@vJ?-fxU0a}RrAF6oAg7G_v z`$|nU#k*Y?E-Z!5^;pk9xzh|PFNuE&V7WT4)q5vl(b?_(vF?3Rr|B!NFEJ5ZW(_8+ z#T5(k#CqCFzrmyU8xLx@;ZzrX=_Bn4I{yj<3~=@QrcUWsorh43UbA)8XoWQTRA_^E z&F4|UIR6e+9MZCL5% zm{6ACiMVcW{{GQ~lnkXRq^lHD|Qzrmd()dXJ%1ximf?^KJ znWOOhBZt_)@B^o%-7?Wkt}5v+&DUzHs|f$kA+`6_PgMv?=|;GhlVv}iLzK}QfAu;i zEG!yEO}(bLoRFPv?mT~!zjL$QGOJ-VUxC@xfrW;U^aoaTd;He{Q;Kr{K}3_<$U?w) z<)Pctb9*(@AWSw}1Y%pQHvDoXU#kXDo|k~_`&zTPIP0<-^J+^VK5uT+Bv)ZoaMrgB zBV@g6CElkaDvrnZuRN$};eV?&W`+P>3AbX$0w#ulthf$d2j9)U?#ZFb`lFAl;PCzA zDg!Vs(!WmlZnXB)Vw9Ds$`1OkrfNN@xFD+#k{O%?hqNzQ+s}A9!?0 ztqv;`sa#RxD@PPMVjM6qmcO2+i+@X2Q+3ysabAmdf|C{_#PKl0+{KN_@=@5ZjT&g9 zRI1^Jr<~WcN_I-#(rJ9wbg(C--B)i?OL1^oms=|)utA%^c6Vrq_*Lrg*7xVW6mdQs>~lf{iFqXC~`iU6VAm>&YdPS z>M((5?F9PL8p90N_!lhqO>sShZu;Mg9se?uCdeb?3&dAND0rIp`iL3z);wB}OF!>o zm;Sj{=XoeP!`p^DArx?=*(Wg1z%YQz!Rnz)?7me@MrS34p+Xj7b}O zyDd8~6!Q5YEAcD&w5`J|51txm)1UG+sa4$ zB6-^Y75gj+rX~FN6TOG8NEdisVczQ6hpgRIteH%c1mJEkeD__@@sAtYw@-zI>l^Ym z$aV1Ura@<(n}xP!mgYrceg!yKm`wl#=5VxZHf#Wng>)X>0U63770wHr`{DlF+; zNSQ;Ej=bcA&EkTOMdUIW=-o8OHu;(hc;`d;laP*-_yGsmBqK5QH9S>oQ(|ErA1&k*((7GN~2)UEgR)yzJhjut5ww6uXodj{;;bGl;!QA%$gz>yF1)Vp-!D?r&;sxX22I!m9YBxsz)=GpOzJDz z`15*88I)BdKkt=L5ByCZ*(}T_KgXf@+xpaLkqRXct(_=jw0M=1&fVFp-9>Pp;g%aR z;I@14?BI7GEuMThemN&D`8#na1g`a@G*l(M9O6Y{u~y#Z*(}M%SbTr~x@bfAE-Fpj zLFrbWDWK)r0H(yxT!sQzRh=43B}BH3-A$tTw)Z(FqSYE%CaK1)ooAqYYVS>6X4UYx z!IWx^Fap!3?SLj^3ut_A{337ZA~)~)j8g*lcxS>72bkizZCzzR$EgaF5-$FAV}kt7 zlfUKrdquaNk99%ugg#}bJGAz2lJJtZj=l6I1KUQ~@qMINVYjTu$vNMT2sVNVSt`d8 zrcXJ)D9Y1P3~3N4ur4Bu^cBg&@E4%|+8pNBm&=v}$VdKbR2()Zf+6hx@RXm&FBZip z_wUE74jpa#8H+(6!QIit%^&r z@&#sW)1Ba0>f1J#3*#iAf~2YDJZC#QFg&~&7|roUcb^l{x2TlT+i1oh?*i?;3K!UH znvroQ96w z$FxbrVGZ7}ik=#XI3p$S))VwtJg~N|k1k0@H<=tFH1)!OvN%*55^oU8M-AIIF$&Bwb!1WpO+!4 z`)P~hnBU#|(9(F77R$=nV!8&0A>MR!5x+qMIP!i$pnEFIe%iqN-}>guWo(AgU!qFhOgZe$mD6jLG#YsiC>aiX-bCqO(C%zbl)DvazGv92u_n3~; zBUaW15tIJaTq+*EN(OllOMhVuSgOofmp0F#3`V`)*hic{;{624ZvXu#Dg~Q=Z?u#iCEj~bwP_Eh=Ou&c z@Q6wL2piY#v&RqL3-iP)5DG8;Bx>dkFe!zVxd zzDhi%JjzD>hFp|s1(xM5V&|D_Ck?7=*feU^#*ea|cx>GJyZC}!c7VG$CYC5nwsEnP?q)G+8#pM>L-*tbv82Mf!~Jj6oXmcNm1 z)~~c&6!WyiBYEU4v$^6le|cM(DK;`7l2Jk_-G{A@q)38eiz^BdGNfl9?Cf6=me?yrcDZARb zr;hotLN5h&=vUa;F8`f!fjjeDOu?xU%cgwkaX>*AG|JYJGQI`q$~~h07!TCQ;&GHA z=`SY^IDipxyni2>Q9#_G1dF2^?1R`o!q0v98?(~{qzRcP*c7EXX;_on*wBCxS6?va zZir_dt|^|Be5~I_IeN6VY0&Z}`<0)=)oP)Z)J^)z6(%#F9S?5@DfIeX?4pO7W{gk` z&jp^RuQ6t*z_FbB`4{G%+#V)Zv9;A4bQ}I}u z@i50V17o)J5tZg!WLEcnF2*o`EC<4S9Muvsn6_a2NScpdUQ;$#$wD_VF+#~5yb(Y8 zwWYSI`MprZxo92JeS%&7pKQNO^4`*$g9IDA3S-dg0q@RI_mn|&fjP`8bp%#Y`!uE* z@_wz26gHdP+7Htg@*#nvM+63FF13a_lGi-F#bEjDmHn|117~sc@lCc!SJjbOGfbvz zs%899(@?Ajs$fQ|1m>G!Oe(KCuMm$gz z&g6xb%c$UCDur#@SGpaEPZ2{eQ%xgEF9aVwQ0OI&?(=?ElW^>KD#EoJ${;;c0-{V@ zqSEA+CDn~}roS?q;HO^@>U~&Xp>IGV%Cz6}AGH1B>jaL^snBS0-G;V_3RlGSkdz}i zF%0D#%B2Ta(7H_jlPK&Zca_C4j}0YonOwBfQ3O1L^_P|!nY(5#)6iFHM`~*N+r<4i z!_xSLVHx}{Pt9j1TP{+|>13fu!t0r*(#`qI-a7A3I2qZkCbF2!SgILSwm>UlVXs|j zaK^D8v$L)76o4sv_dm)%8%9`)ylGHX$1yl3@P`JOj+*KkLbl4y>K3^R1Y>6g(JGtY zJ?GD=ieIy+xK!4T36FJIR8sSP)qDSRE0(J!s9Glk-ymXG3lo!)d|r1Ld3_uwwPV!Y ze0q!`>Us|KR7V$UQ#Dc5mv}<@dsrra-Q~1iEa1kR-uq^d<+-;|))>A^$2dtm_GfiXMYJOKnI7~^yjcKgC^ z8XEH$BgZwAhUO_ku%_FXF~0{zubuActM=DqB=NR}=yamvo?c#&gig)9i*%)zSs0iAu6_c-4BO{7D*ayE0j5vRM+jfQdX*?USO8 zALyjRU}-9r?eo-^)z3HCYxH0#jWs#%)3fX^*O0JAWj~Fut6GuOnTiI@MmGt*F$4|w z&c({p2r^QyYnDgZdJpwKo!2U7ofjA`b$!t|&V*syJ1$ZF8fA>&zh)pa!kk-8OE(=r zY@YK!nD+TvIne8mf07zln!Rp%GNzQxAWiOi8AzISfcul%41aM~8E;6S=GlL&OE`s( zYu($KPp#*EFJB*rv$BezmehLw)X&6jaW`ahKGy<6!s{!dq~_anr~3fqa*n;y*B?OG 
z4;z`qxsV!l?m&KExj`pyij-_3fy`WTsk!dYX4>~pR-#eT zltF}=R)hQ~B3O`xw!}(tN8{zLkBQ*?oR*h6#g?18#v{wwuSx%y=+oA6BXE?*aGMjB z?eJkKp)tw7{oHxZ!yfRPFKkxrPg+#>;th6kxt_OEwG6?5dFSQGPF8~Vh zoMb2?{HV)wl-b$VMrjI`XmoVaA$7HrHba3XJsszdV%#ghKwB;TchH{WaZc*@Bf<^S z$HXET%e?z9`ta8s!e|?gPyhB#ZI3(FmZK7~n;Z^mCRhTF0kJBX6$Z0d0WGin&)R_W zENzn^6mPDg8fp3J=gF$Ex=%+a7tTuhA-}CmY2856K4BKM07u)Ymm!nAD_n8s-uc3D zovm)WqII3rB4;AwTjq(jlbYyNOW0%RDVSP1Prp*LEk1gSJ{k%$Kzh5?m_Us%$j%{u zk@ruxYd9mO+&+o@@*gP8XPL;PR7-#7r!9wyMqKVNj zdz4sX@khT~N7Dac%h_f_F=%XBH4>Vu5Jrsx>HvOHTKuQxSA6f!pi=Le&HXjRtyg&l zbimhP=$8YKhOFraOjl7+3JG4(So=?Qty086hZNNRUP|!m7Lc0Lc!V1h-p6m^<9zOg zu2Fb086eVHIlIJ{NH(l=PRu1$vU^bIi==?B|De|-f~4)746yw6b{3qLN0C`(P@Ey1Cg8qMiA%;JHuh5B4iy8wOoB%s*K3!xMl=H}nkL0JivJvh)un@Q{i2wPO2mb2S8 z^~qd!>HU*cLuonXEqNj+d#p6W%_%!3H@2+#hv8+Wp?@Ej0DB<$@{SjyG7ZNG?rHam zR;dh@o@O5zWL?`d_Mh$D-|p6%kbL{&HE4ZcH7UskVw08kD?RmDMJhs5dfeFFd~sl4 zSiS%GFF*@9Xe~NE#$a}?haKb_PVB<~4A+4m$US#TY5nBe>AAJSY}TKL5dA}{&H!PB z0RLHDy&(Lw^J@ew3{>iPJUU82CC`3kd-~kb>whWsTHV^?c=>7)ZRSH;-$i_oem&u@ zB+ROi5LODmm z0FDL}t_D~AY$+`{%Ezqx)0&VX=8KBmt;m;*)O}O$I;>YEWP>&2IW*w2Uz>(=O;8bP z3UQvrNFMyk_Di%wFJc0op4Uz!y2~->W8-?k;CV9*-4@MCepW*u`@0)5<#7Qz`E(Ql zuZyeeNh?Z(pN;3%3)x>SyHqs;!{1=6hI+2zhN9Raj zaA&g8*Sh(uNjE4_2(r(xid0YmKldvs8x58&lx~YKlFc6FQc%bE>+m@+MFdtCVrz3v z_BU-kv$DVS-3IQ*FquM25sF7)g@CNmoY7o*-yRM3Yq^d_qcDZR9&cc{a{4m$wlZ&C zW0Plda6_@Wr^4v7Gp6v_U9a*C8U+FFGDGRIR8jEyjzZPC3+8#T(i*|HI*MH)9?zeldy{>cKoO|!Jug`of#VkI5sj?ThN# z=dpRB1+1Bmkbjp5I(%UjM;~xO)w+Gkf3?B1K&n9mC*ZYxPnE_kfZ7k_({k5G-bRMe z8aNjL4Bb}9@TKU(wZ7*i{d6_S(QT+3ENf?1#i(||3D&yxaRL?08a$>8RjH+1{76nT zr^)Lk=%pkrOY>LJGrF|>eu!{4cuFTkV@c)f7${uPoy|_awRZ0vU;eIdddrn)LLY%o zL@TIFH`k1+=hETD#a5J>jH7GVeE?vv%z1W7H$ovuUlo)`2gc)V`5dKM*~YBSFr7_L zn7xdu>zGz-G}2yyBqi07|1G|iV7ml$D3wqXVD=JDD=c)o7SnwE>@sE2V_cbSBN=Ww zb9%+e)8)-Ws@v{s)|HVJPcsv;N<2&BhtNyPtX)N z?<-dZZY9*Il~M&C)LYRwka?5W{I?--HCc03xL;5m7SyAm@?-vj2L?z>c4P`)E7j1q z_aIUEtNH>gPJU*h5cLg3gmL2Zxr3e7`^<8g`i~r8+&m6% z>m-59AFG74Zr<7ClcI zuO=+`f-WfUTM~*;K-sza94Toj1Ym$I=jdtdxS^RsX}wcJcK2;eX1>$_ zDxmKq+wo5&v{NzE_ZzXkk4&61defEb!QGRvtSlEaTn^`p>U1(cj?2f6+q=bn+S!i9i~c;)#Kiq##ve(QvhUPPG0p zeu8NH-P2Nfx-ck&LZ60^=0Wy%qsqe zMbPZ`EkCqE#jD3249St&nM#<(PWa=@Zu730fS4<$`ji`7oRHbb0rxkKCK*}&I#UICCTEf}4XwBsT z*wiQIdmgZC-zKV)YilbL>8M-5&gxF*l~V}1lx!N3{}d$8RI;gdTiYaa(ANAQ^_$g{owH{;Vl*`S5-?cKhrS^3?Cj1N*dX zzs^Xc4V)jf(&I?*?-aX(tvjY25A0#qmp(}F>H+UOIfvFQOSOZH7{cfD7j7`9sjAJe zcCWkQ{@Al0zzkT=*Wi%TkwzZ1l77ln3rW9}`qwHM3Ypia>9!}0osDzR^36X5;>K%j zy#nF!s|=n-QFr~O2Oe6^EckPq9!Jqb3U`c;3-yTx448%%?+!2 z@dK#S6`h3^zra*{wZw|XmV0A5G#s)dc0>QMB~SfR&T?jg$6zF5kNUW; zuca0EuA3fj(U>@lbu~EHH$5*fUY%@=FEGKC7W`Ma8VS~5T zS;1Ma*^Y}sfJtHIcW;jC(=%k#iED@K{&8DKk@K(sg<@dkSWE8eAAiTl)inLB&TUtM zy6eoJa`ckV1|A7e7G$?AYgDB{;A*Sc^=baj;(Vl^D*P10hN z#Av~zvwY2&o)dOwPHA!6L!#D@#l2)i)2l7yKYz83OD^{31~o!bH_J-=X@Ic*M)3*j zypsD`0L)>smf!lj9e>O**r+h5|`* z7po~YOXQ^(avsVDckW$?bF_}Wd3CY>h;nw{)BWZ!YEiKB9lBJc z7m#Lu&oNnB?lebwPRC*ijPAoqQM{@oZe3an{8Y=Zg(+rt%pn;R+d%R%R5; z;#KKEFgRpt_4&m&(Umw4=e`5wy9UF|VDT!S9^dzywdFcKXuV?};waPybUAQtepH7_ zvTXhdV~^;1Z4^srL*`lTzjQMcutTF<`r&5QCMQJ^l!qUsnT0G~DuGyuMq5kpbe+6w zcz;#tvQcdLY=Qj&r$^x=$TQp_KNN|K52GWEYm@NMRq~t{Glq@9^=@NGEFV3wq9T&D z)J-YdM+Ox*cgFx1YBo>y21rAa0-|$n647e=Z$RqxHrY2l@W37CpHCwN?t*Cik_ecA z3p1eebXv!=_`xzu5V8lGMI;ps-yJ1J;VWlepY(M z6T9p!a6QATCav|GL|E=it?T~$bXHD32TFx2wz{Y-Q=V**3P4=6?^0<6lA;=2r_OtHSxIr0mPkkCpcGaU|9!mr7eUw+ zaZnjtnimkXB5iL$qa@ciTs}yFVe^-RS_?VtHzl(-`=4%E%urO^{BT224j7>xUoX$}YLCg+Xty za;j~DK#R>>SPo+NnRArJA5c97yPd^NOs!gA$XP&kFdvFbr2{>ByvWR-xMCcIiv~4I$K}lP^6(#fBlBy zQ+C6G@%GYpj5hUpd>s;w#LB*xZhpfT(Fgh1xStwypXRh%RHIZ5OKK}-NfuV08ub=V 
z2WmFrQe8ZU-J_wKNZXZGtiwUQf~;L;CriM(MF-8>Qi>FXp}6Aug6^NYH$nK+(MW@X zsA9gf^TOxQo!oqQ{AeWXtME~c|B>rJ_8oStwY9c0a}ar2(jvvLR;gSr?ED9#+gTE+ zv0tf+(mm==@r=BM48IEt_(xVR_eU2Wu7SXskYd`#OPkP0ch~z`CiOerq zFE%z+kSKctS`vv|0v9+)pJ;zb0~#=)d?rhAa#cs>ll!{g8y6jOq^gj#mojMjRR7-N zoFD;z81GGEi5A+FzLhLKhmeF%qUhcMoJ`d0sHDwBTO2O0_oyv)IvbdyqmzS0a42EJ zQB!s6a|!{>dZ-v=V)A^Llq4BNY?F*TWITM&Y1@U+&c1IzYpu1a54irg@6&cE9QMwm zb2K9Aeb5iNuP1-X3n?zfS#x3usowdhsnp;R6`FSG$jf-IVr8{d=CZN0@WPP7$Pp|)4@fx1P1-j0}V z9{KF*Ta3(0B&Gvo2*&AA3l1sXY1`&kR#ML{s>ex;_AXR2%BU#vL9K8AUjw`K>)>Ox z3Z{1;+ADP6B@D&Q@H^&KKvU1-B{LY?pm98QRZ&N&I+TFP0o;nErMN^YZ5pq-lVhq9 zvmyI(3yF6#s=8;)dbBKjaA9~aKyU~DBd zbv9AyW`VZ0P0Q`BitHLz`?BHTDvIwed}+ru~5w9IbW5V*?(U%gB{Ds#Z z-s;u3taMqh@grWTMnRH*`e9M)2esLNK~q3V|JMb6B|ogk^%AdYS^;X{K?K0&n!OYI z97Hi1azs5By)@oN(h^`sZC_&tG3KDYkSnb^Y);8J5xX}$ff!#htF&l+z{3~jUq-f8 z4rF2b3`vdDr9GsQkg1Wgz)dD2-Ovj_mH<2(v2cU>JfZfJnkuhP7*B3uwM%ds);XOiw*8Eb=^(PMSMReh}+q z1+Mk;Dw(q!wz^YwYL>Y!<;)vZBwBtxWlEu1|E<3j6nekRWyD~pOUcVlVxj0@-4tq* zX3`8KOc{&uv38+gWm3_g(NDtW*kHSS2g2~7x!4Dm8~4(-jzeL=QZib2WCS%yoXPTn zIJ?6DHAje50kanUDD_|VC$){_wd8;7&pI9iOLU&pZ1Lw`L@2%;E{0u=3vO)={4aIX z4t0(cK^J|8Q|)Z}FB=_!A#MCO$^?G{G~loLmyPnbSQq<_*k48|dC(g< z1c8(Sr96H5KUC=<08#AFoY<4*A7_#R!6a>s1xyV){+E3PMHIW@#%{*_%Q2P9iN!}K z?Lpe#`VVX>;r|zB>Ys+Ti2e^^+VfYbCDiNuU$!VMqL}DV@KMOWgINAm%G2KL`!Dam}yGKE+GNJtX2Q*HI2Q2%mhA1dYx)QkXgG%5@{T32G%D_gTuR17>83;}p2mJlAT z#0vY$!AScWCLXN@p6gW@kteMLwB4|MHaqASA+1CX?aBc*+l~YjDX11&D~=M)ECH2} zY@a338x2ANAlm|L1A)Zqh-fghma9Am1%QnD8~OJm4to|NiUzO+8%Uxfqhe&BWjv4> xh#;a^$iG{wgP$WL2w)4q`Ko;o(f_9caoE?ZKYPB-%|}9fin3}ll~QKG{{zMOogM%H literal 0 HcmV?d00001 diff --git a/client/index.html b/client/index.html new file mode 100644 index 0000000..8d9561c --- /dev/null +++ b/client/index.html @@ -0,0 +1,77 @@ + + + + + + am real programmer + + + + +

+
+ About + Store +
+
+ Gmail + Images + + +
+
+
+ + + +
+
+ + + + +
+
+ + +
+
+
+ + + diff --git a/client/search.html b/client/search.html new file mode 100644 index 0000000..16b39e5 --- /dev/null +++ b/client/search.html @@ -0,0 +1,119 @@ + + + + + + Search Results - My Search Engine + + + +
+ + + + +
+
+ + + + +
+
+
+
+ + + + diff --git a/client/styles.css b/client/styles.css new file mode 100644 index 0000000..89c3926 --- /dev/null +++ b/client/styles.css @@ -0,0 +1,247 @@ +* { + margin: 0; + padding: 0; + + font-family: 'Roboto', sans-serif; +} + +body { + display: flex; + flex-direction: column; + min-height: 100vh; + /* ensures the body takes up at least the full viewport height */ +} + +a { + all: unset; + text-decoration: none; + /* no underline */ +} + +.top-section { + padding: 1rem; + display: flex; + justify-content: space-between; +} + +.app-icon { + width: 1.5rem; + height: 1.5rem; +} + +.profile-pic { + width: 2rem; + height: 2rem; + border-radius: 100%; +} + +.left-side { + display: flex; + gap: 1.5rem; +} + +.right-side { + display: flex; + gap: 1.5rem; + justify-content: center; + align-items: center; +} + +.left-side a, +.right-side a { + color: #202124; + font-size: 0.8rem; +} + +.middle-section { + flex-grow: 1; + display: flex; + flex-direction: column; + justify-content: center; + align-items: center; + padding: 1rem 0; + gap: 1.2rem; +} + +.search-label { + display: none; +} + +.search-form { + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2.5rem; +} +.result-search-form { + flex-direction: column; + align-items: center; + justify-content: center; + gap: 2.5rem; +} + +.search-form-input { + display: flex; + align-items: center; + justify-content: center; + gap: 1rem; + border: 1px solid #dfe1e5; + border-radius: 30px; + padding: 0.3rem 1.5rem; + box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1); +} + +.search-form input { + width: 27rem; + padding: 0.5rem; + border: none; + outline: none; +} + +.buttons { + display: flex; + gap: 1rem; +} + +.search-form button { + border: 1px solid #f8f9fa; + padding: 0.5rem 1rem; + background-color: #f8f9fa; + font-size: 0.9rem; +} +.search-icon-home { + width: 1rem; + height: 1rem; +} +.search-icon-result { + width: 1.5rem; + height: 1.5rem; +} +.mic, +.camera { + width: 1.5rem; + height: 1.5rem; +} + +.bottom-section { + margin-top: 15rem; + padding: 1rem; + display: flex; + justify-content: space-between; + align-items: center; + background-color: #f2f2f2; + font-size: 0.9em; + padding-left: 2rem; + padding-right: 2rem; +} + +.bottom-left, +.bottom-right { + display: flex; + gap: 1.8rem; +} + +.bottom-middle { + padding-right: 10rem; +} + +.bottom-section a { + color: #70757a; +} + +.search-form button { + background-color: #f8f9fa; + border: 1px solid #f8f9fa; + border-radius: 4px; + color: #3c4043; + font-family: Roboto, arial, sans-serif; + font-size: 14px; + margin: 11px 4px; + padding: 0 16px; + line-height: 27px; + height: 36px; + min-width: 54px; + text-align: center; + cursor: pointer; + user-select: none; +} + +.bottom-section { + display: flex; + justify-content: space-between; + align-items: center; + background-color: #f2f2f2; + padding: 1rem 1.5rem; + margin-top: 15rem; +} + +.bottom-section a { + margin: 0 1rem; +} + +.bottom-middle { + margin-right: 8rem; +} + +.search-result-area { + display: flex; + padding-left: 1rem; + gap: 1rem; +} +.search-logo-home { + width: 20rem; +} +.search-logo-result { + width: 7rem; +} + +#results { + padding-top: 1rem; + display: flex; + flex-direction: column; + gap: 1rem; + padding-left: 2rem; + padding-right: 2rem; +} +.result:hover { + cursor: pointer; +} + +.result-description { + font-size: 0.8rem; + width: 50%; + color: #545454; +} +.result { + margin-bottom: 20px; +} +.result-title { + font-size: 18px; + color: #1a0dab; + text-decoration: none; 
+} +.result-title:hover { + text-decoration: underline; +} +.result-url { + font-size: 14px; + color: #006621; +} +#pagination { + display: flex; + justify-content: center; + align-items: center; + gap: 1.5rem; + padding: 2rem; + font-size: 1.2rem; +} + +#pagination a { + color: #1a0dab; +} + +#pagination a:hover { + text-decoration: underline; + cursor: pointer; +} diff --git a/search/complete_examples/advanced_pagerank.py b/search/complete_examples/advanced_pagerank.py new file mode 100644 index 0000000..f7128a5 --- /dev/null +++ b/search/complete_examples/advanced_pagerank.py @@ -0,0 +1,239 @@ + +from bs4 import BeautifulSoup +import requests +import time +import random +from queue import Queue +from concurrent.futures import ThreadPoolExecutor +import threading +from urllib.parse import urlparse +import csv +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... +# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from indexing.advanced_indexing import advanced_index_page +from serving.pagerank import compute_pagerank + + +# Function to check robots.txt for permission to crawl +# If we don't do this, we could get blocked/banned +# since we don't have permission to crawl. +def can_crawl(url): + parsed_url = urlparse(url) + robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" + print(f"Checking robots.txt for: {robots_url}") + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(robots_url, timeout=5) + response.raise_for_status() + disallowed_paths = [] + for line in response.text.splitlines(): + if line.startswith("Disallow"): + parts = line.split() + if len(parts) > 1: + disallowed_paths.append(parts[1]) + for path in disallowed_paths: + if urlparse(url).path.startswith(path): + print(f"Disallowed by robots.txt: {url}") + return False + return True + except requests.RequestException: + print(f"Failed to access robots.txt: {robots_url}") + return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) + +# Function to fetch and parse URL +def crawl(args): + queue = args['queue'] + visited_urls = args['visited_urls'] + crawl_count = args['crawl_count'] + CRAWL_LIMIT = args['CRAWL_LIMIT'] + lock = args['lock'] + index = args['index'] + webpage_info = args['webpage_info'] + webpage_id_counter = args['webpage_id_counter'] + pagerank_graph = args['pagerank_graph'] + stop_crawl = args['stop_crawl'] + + while not stop_crawl.is_set(): + try: + current_url = queue.get(timeout=5) + print("Time to crawl: " + current_url) + except Exception: + break # Exit if no more URLs are available to crawl + + with lock: + if crawl_count[0] >= CRAWL_LIMIT: + queue.queue.clear() # Clear remaining URLs to stop processing + print("Crawl limit reached. 
Exiting...") + stop_crawl.set() + break + if current_url in visited_urls: + queue.task_done() + continue + visited_urls.add(current_url) + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + if not can_crawl(current_url): + queue.task_done() + continue + + time.sleep(random.uniform(2, 5)) + try: + response = requests.get(current_url, timeout=5) + response.raise_for_status() # Check for request errors + content = response.content + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + if 'noindex' in content.decode('utf-8').lower(): + print(f"Noindex found, skipping: {current_url}") + queue.task_done() + continue + + + # Parse the fetched content to find new URLs + webpage = BeautifulSoup(content, "html.parser") + + # Index the webpage + indexed_page = advanced_index_page(webpage, current_url) + with lock: + for word in indexed_page["words"]: + if word not in index: + index[word] = set() + index[word].add(webpage_id_counter[0]) + webpage_info[webpage_id_counter[0]] = indexed_page + webpage_id_counter[0] += 1 + + hyperlinks = webpage.select("a[href]") + #NEW: Add hyperlink connections for pagerank + new_urls, hyperlink_connections = parse_links(hyperlinks, current_url) + pagerank_graph[current_url] = hyperlink_connections + + with lock: + for new_url in new_urls: + if new_url not in visited_urls: + queue.put(new_url) + crawl_count[0] += 1 + + except requests.RequestException as e: + print(f"Failed to fetch {current_url}: {e}") + finally: + queue.task_done() + +# Function to parse links from HTML content +def parse_links(hyperlinks, current_url): + urls = [] + #NEW: Add hyperlink connections for pagerank + hyperlink_connections = set() + for hyperlink in hyperlinks: + url = hyperlink["href"] + + # Format the URL into a proper URL + if url.startswith("#"): + continue # Skip same-page anchors + if url.startswith("//"): + url = "https:" + url # Add scheme to protocol-relative URLs + elif url.startswith("/"): + # Construct full URL for relative links + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue # Skip non-HTTP links + url = url.split("#")[0] # Remove anchor + + hyperlink_connections.add(url) + urls.append(url) + return urls, hyperlink_connections + +# Main crawling function +def sloth_bot(): + # Start with the initial pages to crawl + starting_urls = [ + "https://www.wikipedia.org/wiki/Google", + "https://www.bbc.com/news/world", + "https://news.ycombinator.com/", + ] + + urls_to_crawl = Queue() + for seed_url in starting_urls: + urls_to_crawl.put(seed_url) + + visited_urls = set() # URL tracking + CRAWL_LIMIT = 20 # Set crawl limit + crawl_count = [0] # Shared counter + lock = threading.Lock() # Thread safety lock + index = {} + webpage_info = {} + #NEW: pagerank graph for pagerank. + # This will be used to store the connections between hyperlinks + pagerank_graph = {} + webpage_id_counter = [0] + stop_crawl = threading.Event() + + # Start concurrent crawling with ThreadPoolExecutor + #Concurrency = speed + #Threads go BRRRRR + #Increase this if you want more threads, but be careful with these. 
+ NUM_WORKERS = 100 + #Setting up arguments for the crawl function + args = { + 'queue': urls_to_crawl, + 'visited_urls': visited_urls, + 'crawl_count': crawl_count, + 'CRAWL_LIMIT': CRAWL_LIMIT, + 'lock': lock, + 'index': index, + 'webpage_info': webpage_info, + 'webpage_id_counter': webpage_id_counter, + 'pagerank_graph': pagerank_graph, + 'stop_crawl': stop_crawl + } + + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + for _ in range(NUM_WORKERS): + executor.submit(crawl, args) + + print("All URLs have been crawled") + + #NEW: Computes pagerank + pagerank_scores = compute_pagerank(pagerank_graph) + + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. + """ + with open('advanced_pagerank_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['word', 'doc_ids'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for word, doc_ids in index.items(): + writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) + + with open('advanced_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['doc_id', 'url', 'title', 'description', 'pagerank'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for doc_id, info in webpage_info.items(): + writer.writerow({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'], + 'pagerank': pagerank_scores.get(info['url'], 0) + }) + +# Entry point for the script +def main(): + sloth_bot() + +if __name__ == "__main__": + main() diff --git a/search/complete_examples/simple_pagerank.py b/search/complete_examples/simple_pagerank.py new file mode 100644 index 0000000..af42cdb --- /dev/null +++ b/search/complete_examples/simple_pagerank.py @@ -0,0 +1,110 @@ +from bs4 import BeautifulSoup +import requests +import time +import random +import csv +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... 
+# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from indexing.simple_indexing import simple_index_page +from serving.pagerank import compute_pagerank + +def sloth_bot(): + # Our list of URLs to crawl + urls = ["https://en.wikipedia.org/wiki/Google"] + visited_urls = set() + + # Create the index and graph + index = {} # URL -> page contents + pagerank_graph = {} # URL -> set of URLs it links to + CRAWL_LIMIT = 5 + crawl_count = 0 + + # Loops through the list of URLs + while urls and crawl_count < CRAWL_LIMIT: + # Grab the next URL + current_url = urls.pop() + if current_url in visited_urls: + continue + print("Time to crawl: " + current_url) + time.sleep(random.uniform(1, 2)) + try: + response = requests.get(current_url) + response.raise_for_status() + except requests.RequestException as e: + print(f"Failed to retrieve {current_url}: {e}") + continue + + # Parse the content of the page + webpage = BeautifulSoup(response.content, "html.parser") + + # Add the page to the index + indexed_page = simple_index_page(webpage, current_url) + index[current_url] = indexed_page + visited_urls.add(current_url) + + # Grab the links from the page + hyperlinks = webpage.select("a[href]") + #This is where we store our connected pages + hyperlink_connections = set() + for hyperlink in hyperlinks: + url = hyperlink["href"] + # Format the URL into a proper URL + if url.startswith("#"): + continue + if url.startswith("//"): + url = "https:" + url + elif url.startswith("/"): + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue + url = url.split('#')[0] + #Add to the link connection + hyperlink_connections.add(url) + # If we haven't visited this URL yet, add it to our list + if url not in visited_urls: + urls.append(url) + + # Update the page's outgoing links + index[current_url]['hyperlink_connections'] = hyperlink_connections + pagerank_graph[current_url] = hyperlink_connections + + crawl_count += 1 + print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}") + + # Compute PageRank + pagerank_scores = compute_pagerank(pagerank_graph) + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. 
+ """ + + with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ["url", "title", "description", "pagerank", "words"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for url, info in index.items(): + writer.writerow({ + 'url': url, + 'title': info['title'], + 'description': info['description'], + 'pagerank': pagerank_scores.get(url, 0), + 'words': ', '.join(info['words']) + }) + + + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/crawling/advanced_crawler.py b/search/crawling/advanced_crawler.py new file mode 100644 index 0000000..ff217a0 --- /dev/null +++ b/search/crawling/advanced_crawler.py @@ -0,0 +1,224 @@ +from bs4 import BeautifulSoup +import requests +import time +import random +from queue import Queue +from concurrent.futures import ThreadPoolExecutor +import threading +from urllib.parse import urlparse +import csv +from indexing.advanced_indexing import index_page +import sys +import os +# Add the root directory to sys.path +# This is to be able to import modules from other directories (indexing and serving) idk why... +# any imports from indexing/serving need to happen under this +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +# Function to check robots.txt for permission to crawl +# If we don't do this, we could get blocked/banned +# since we don't have permission to crawl. +def can_crawl(url): + parsed_url = urlparse(url) + robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" + print(f"Checking robots.txt for: {robots_url}") + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(robots_url, timeout=5) + response.raise_for_status() + disallowed_paths = [] + for line in response.text.splitlines(): + if line.startswith("Disallow"): + parts = line.split() + if len(parts) > 1: + disallowed_paths.append(parts[1]) + for path in disallowed_paths: + if urlparse(url).path.startswith(path): + print(f"Disallowed by robots.txt: {url}") + return False + return True + except requests.RequestException: + print(f"Failed to access robots.txt: {robots_url}") + return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) + +# Function to fetch and parse URL +def crawl(args): + queue = args['queue'] + visited_urls = args['visited_urls'] + crawl_count = args['crawl_count'] + CRAWL_LIMIT = args['CRAWL_LIMIT'] + lock = args['lock'] + index = args['index'] + webpage_info = args['webpage_info'] + webpage_id_counter = args['webpage_id_counter'] + stop_crawl = args['stop_crawl'] + + while not stop_crawl.is_set(): + try: + current_url = queue.get(timeout=5) + print("Time to crawl: " + current_url) + except Exception: + break # Exit if no more URLs are available to crawl + + with lock: + if crawl_count[0] >= CRAWL_LIMIT: + queue.queue.clear() # Clear remaining URLs to stop processing + print("Crawl limit reached. 
Exiting...") + stop_crawl.set() + break + if current_url in visited_urls: + queue.task_done() + continue + visited_urls.add(current_url) + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + # if not can_crawl(current_url): + # queue.task_done() + # continue + + time.sleep(random.uniform(2, 5)) + try: + response = requests.get(current_url, timeout=5) + response.raise_for_status() # Check for request errors + content = response.content + + """ Checks for noindex directive in the page + Comment this out if you don't care about noindex + WARNING: websites could block/ban you if you don't have permission + """ + # if 'noindex' in content.decode('utf-8').lower(): + # print(f"Noindex found, skipping: {current_url}") + # queue.task_done() + # continue + + + # Parse the fetched content to find new URLs + webpage = BeautifulSoup(content, "html.parser") + + # Index the webpage + indexed_page = index_page(webpage, current_url) + with lock: + for word in indexed_page["words"]: + if word not in index: + index[word] = set() + index[word].add(webpage_id_counter[0]) + webpage_info[webpage_id_counter[0]] = indexed_page + webpage_id_counter[0] += 1 + + hyperlinks = webpage.select("a[href]") + new_urls = parse_links(hyperlinks, current_url) + + with lock: + for new_url in new_urls: + if new_url not in visited_urls: + queue.put(new_url) + crawl_count[0] += 1 + + except requests.RequestException as e: + print(f"Failed to fetch {current_url}: {e}") + finally: + queue.task_done() + +# Function to parse links from HTML content +def parse_links(hyperlinks, current_url): + urls = [] + for hyperlink in hyperlinks: + url = hyperlink["href"] + + # Format the URL into a proper URL + if url.startswith("#"): + continue # Skip same-page anchors + if url.startswith("//"): + url = "https:" + url # Add scheme to protocol-relative URLs + elif url.startswith("/"): + # Construct full URL for relative links + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue # Skip non-HTTP links + url = url.split("#")[0] # Remove anchor + urls.append(url) + return urls + +# Main crawling function +def sloth_bot(): + # Start with the initial pages to crawl + starting_urls = [ + "https://www.wikipedia.org/wiki/Google", + "https://www.bbc.com/news/world", + "https://news.ycombinator.com/", + ] + + urls_to_crawl = Queue() + for seed_url in starting_urls: + urls_to_crawl.put(seed_url) + + visited_urls = set() # URL tracking + CRAWL_LIMIT = 20 # Set crawl limit + crawl_count = [0] # Shared counter + lock = threading.Lock() # Thread safety lock + index = {} + webpage_info = {} + webpage_id_counter = [0] + stop_crawl = threading.Event() + + # Start concurrent crawling with ThreadPoolExecutor + #Concurrency = speed + #Threads go BRRRRR + #Increase this if you want more threads, but be careful with these. 
+ NUM_WORKERS = 100 + #Setting up arguments for the crawl function + args = { + 'queue': urls_to_crawl, + 'visited_urls': visited_urls, + 'crawl_count': crawl_count, + 'CRAWL_LIMIT': CRAWL_LIMIT, + 'lock': lock, + 'index': index, + 'webpage_info': webpage_info, + 'webpage_id_counter': webpage_id_counter, + 'stop_crawl': stop_crawl + } + + with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: + for _ in range(NUM_WORKERS): + executor.submit(crawl, args) + + print("All URLs have been crawled") + + + """ This part is for saving the data to CSV files. + However, if you don't want to save the data, you can remove/comment out this part. + If you want to use a database, you can replace this part with a database connection. + """ + with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['word', 'doc_ids'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for word, doc_ids in index.items(): + writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) + + with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile: + fieldnames = ['doc_id', 'url', 'title', 'description'] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for doc_id, info in webpage_info.items(): + writer.writerow({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'] + }) + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/crawling/simple_crawler.py b/search/crawling/simple_crawler.py new file mode 100644 index 0000000..7c80ce7 --- /dev/null +++ b/search/crawling/simple_crawler.py @@ -0,0 +1,65 @@ +from bs4 import BeautifulSoup +import requests +import time +import random + +def sloth_bot(): + # our list of URLs to crawl + urls = ["https://en.wikipedia.org/wiki/Google"] + visited_urls = set() + #timer to see how long it takes to crawl + start = time.time() + #Loops through the list of urls + CRAWL_LIMIT = 15 + current_crawl_count = 0 + + while urls and current_crawl_count < CRAWL_LIMIT: + # grabs the next url + current_url = urls.pop(0) + print("time to crawl: " + current_url) + time.sleep(random.uniform(1, 3)) + try: + response = requests.get(current_url) + response.raise_for_status() + except requests.RequestException as e: + print(f"Failed to retrieve {current_url}: {e}") + continue + + # grabbing the content of the page + webpage = BeautifulSoup(response.content, "html.parser") + + # grabbing the links from the page + hyperlinks = webpage.select("a[href]") + # looping through the links and adding them to our list of urls + for hyperlink in hyperlinks: + url = hyperlink["href"] + #Formats the url into a proper url (don't worry about this) + if url.startswith("#"): + continue + if url.startswith("//"): + url = "https:" + url + elif url.startswith("/"): + base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) + url = base_url + url + elif not url.startswith("http"): + continue + # + url = url.split('#')[0] + + #if we haven't visited this url yet, add it to our list + if url not in visited_urls: + urls.append(url) + visited_urls.add(url) + + current_crawl_count += 1 + + +def main(): + # Start the crawling process + sloth_bot() + +if __name__ == "__main__": + main() + + + diff --git a/search/indexing/__init__.py b/search/indexing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/search/indexing/advanced_indexing.py 
b/search/indexing/advanced_indexing.py new file mode 100644 index 0000000..5e4e98f --- /dev/null +++ b/search/indexing/advanced_indexing.py @@ -0,0 +1,73 @@ +import nltk +import ssl +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize + +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context +nltk.download('stopwords') +nltk.download('punkt_tab') +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context + +# Download NLTK data only if not already downloaded +def download_nltk_resources(): + try: + stopwords.words('english') + except LookupError: + nltk.download('stopwords') + try: + word_tokenize('test') + except LookupError: + nltk.download('punkt') +#Function that indexes the webpage +def advanced_index_page(webpage, webpage_url): + #Download NLTK data only if not already downloaded + download_nltk_resources() + + # Initialize NLTK components + stop_words = set(stopwords.words('english')) + ps = PorterStemmer() + #Collect title and description + title_tag = webpage.find('title') + title = title_tag.get_text().strip() if title_tag else 'No Title' + + #Collect description + description = '' + meta_description = webpage.find('meta', attrs={'name': 'description'}) + if meta_description and 'content' in meta_description.attrs: + description = meta_description['content'] + else: + text_content = webpage.get_text(separator=" ", strip=True) + description = text_content[:200] + "..." if len(text_content) > 200 else text_content + + + # Grab ALL the words in the page. + text_content = webpage.get_text(separator=' ', strip=True) + #Splitting them into the individual words + tokens = word_tokenize(text_content.lower()) + #Big brain techniques 2 and 3 + #Stemming the words and removing stop words. + filtered_words = [ + ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words + ] + + #Add the information to the index + indexed_page = { + "url": webpage_url, + "title": title, + "description": description, + "words": filtered_words + } + #If you want to print the results + #print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n") + return indexed_page diff --git a/search/indexing/simple_indexing.py b/search/indexing/simple_indexing.py new file mode 100644 index 0000000..13845fe --- /dev/null +++ b/search/indexing/simple_indexing.py @@ -0,0 +1,34 @@ +import re + +def simple_index_page(webpage, webpage_url): + + #Collect title and description + title_tag = webpage.find('title') + title = title_tag.get_text().strip() if title_tag else 'No Title' + + #Collect description + description = '' + meta_description = webpage.find('meta', attrs={'name': 'description'}) + if meta_description and 'content' in meta_description.attrs: + description = meta_description['content'] + else: + text_content = webpage.get_text(separator=" ", strip=True) + description = text_content[:200] + "..." if len(text_content) > 200 else text_content + + #Grab ALL the words in the page + #regex disgusting... + words = re.findall(r'\b\w+\b', webpage.get_text(separator=" ", strip=True).lower()) + + #Double check and filter out any numbers, symbols, etc. 
+ #WE ONLY WANT WORDS + words = [word for word in words if word.isalpha()] + + #Add the information to the index + indexed_page = { + "url": webpage_url, + "title": title, + "description": description, + "words": words + } + print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(words)} \n") + return indexed_page \ No newline at end of file diff --git a/search/serving/pagerank.py b/search/serving/pagerank.py new file mode 100644 index 0000000..16c1deb --- /dev/null +++ b/search/serving/pagerank.py @@ -0,0 +1,34 @@ + + +def compute_pagerank(graph, damping_factor=0.85, max_iterations=100, tol=1.0e-6): + # Build the set of all URLs + all_nodes = set(graph.keys()) + for links in graph.values(): + all_nodes.update(links) + num_nodes = len(all_nodes) + # Initialize PageRank scores + pagerank = {url: 1.0 / num_nodes for url in all_nodes} + # Identify dangling nodes (nodes with no outgoing links) + dangling_nodes = [url for url in all_nodes if url not in graph or len(graph[url]) == 0] + # Iterative computation + for iteration in range(max_iterations): + new_pagerank = {} + # Sum of PageRank scores from dangling nodes + dangling_sum = damping_factor * sum(pagerank[node] for node in dangling_nodes) / num_nodes + for url in all_nodes: + rank = (1.0 - damping_factor) / num_nodes + rank += dangling_sum + # Sum contributions from incoming links + for node in graph: + if url in graph[node]: + out_degree = len(graph[node]) + rank += damping_factor * pagerank[node] / out_degree + new_pagerank[url] = rank + # Check for convergence + error = sum(abs(new_pagerank[url] - pagerank[url]) for url in all_nodes) + if error < tol: + break + pagerank = new_pagerank + for url in all_nodes: + pagerank[url] = round(pagerank[url], 6) + return pagerank diff --git a/server/google_search_api.py b/server/google_search_api.py new file mode 100644 index 0000000..3542e09 --- /dev/null +++ b/server/google_search_api.py @@ -0,0 +1,136 @@ +from flask import Flask, request, jsonify +import csv +import nltk +from nltk.corpus import stopwords +from nltk.stem import PorterStemmer +from nltk.tokenize import word_tokenize +import ssl +from flask_cors import CORS +app = Flask(__name__) + + +CORS(app) + +# NLTK setup (handles SSL certificate issues) +try: + _create_unverified_https_context = ssl._create_unverified_context +except AttributeError: + pass +else: + ssl._create_default_https_context = _create_unverified_https_context + +# Download NLTK data only if not already downloaded +def download_nltk_resources(): + try: + stopwords.words('english') + except LookupError: + nltk.download('stopwords') + try: + word_tokenize('test') + except LookupError: + nltk.download('punkt') + +# Initialize NLTK components +download_nltk_resources() +stop_words = set(stopwords.words('english')) +ps = PorterStemmer() + + +def load_inverted_index(file_path): + inverted_index = {} + with open(file_path, 'r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + word = row['word'] + doc_ids_str = row['doc_ids'].strip("[]") # Remove brackets + doc_ids_list = doc_ids_str.split(', ') if doc_ids_str else [] + doc_ids = set(int(doc_id) for doc_id in doc_ids_list) + inverted_index[word] = doc_ids + return inverted_index + +def load_document_info(file_path): + document_info = {} + with open(file_path, 'r', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + doc_id = int(row['doc_id']) + document_info[doc_id] = { + 'url': 
row['url'], + 'title': row['title'], + 'description': row['description'], + 'pagerank': float(row['pagerank']) + } + return document_info + +def parse_query(query): + # Tokenize the query + tokens = word_tokenize(query.lower()) + # Remove non-alphabetic tokens and stop words, then stem the words + query_words = [ + ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words + ] + return query_words + +def search(query, inverted_index, document_info, num_results=10, page=1): + query_words = parse_query(query) + if not query_words: + return [] + # Find documents that contain any of the query words + matched_doc_ids = set() + for word in query_words: + if word in inverted_index: + matched_doc_ids.update(inverted_index[word]) + if not matched_doc_ids: + return [] + # Retrieve documents and their PageRank scores + results = [] + for doc_id in matched_doc_ids: + info = document_info[doc_id] + results.append({ + 'doc_id': doc_id, + 'url': info['url'], + 'title': info['title'], + 'description': info['description'], + 'pagerank': info['pagerank'] + }) + # Sort documents by PageRank score + sorted_results = sorted(results, key=lambda x: x['pagerank'], reverse=True) + # Pagination + start = (page - 1) * num_results + end = start + num_results + paginated_results = sorted_results[start:end] + return paginated_results + +# Load the inverted index and document info +# If you are using a different file, replace the path with the path to your file +#If you're using a database, replace this with the code to connect to your database +try: + inverted_index = load_inverted_index('../search/complete_examples/advanced_pagerank_inverted_index.csv') + document_info = load_document_info('../search/complete_examples/advanced_pagerank.csv') +except FileNotFoundError: + try: + inverted_index = load_inverted_index("../advanced_pagerank_inverted_index.csv") + document_info = load_document_info("../advanced_pagerank.csv") + except FileNotFoundError: + print("Error: Files not found, run the advanced_pagerank.py file first") + print("Exiting...") + exit() + + +@app.route('/search') +def search_api(): + query = request.args.get('q', '') + num_results = int(request.args.get('num_results', 10)) + page = int(request.args.get('page', 1)) + if not query: + return jsonify({'error': 'No query provided'}), 400 + results = search(query, inverted_index, document_info, num_results=num_results, page=page) + return jsonify({ + 'query': query, + 'page': page, + 'num_results': num_results, + 'results': results + }) + +if __name__ == '__main__': + app.run(debug=True) \ No newline at end of file
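
As a closing illustration of the serving step, here is a minimal sketch that runs `compute_pagerank` from `search/serving/pagerank.py` on a hand-built toy graph. Assumptions: it is run from the repository root, `search/` is added to `sys.path` the same way the crawler scripts do it, and the example URLs are made up.

```python
import os
import sys

# Mirror the sys.path trick used by the crawler scripts so that
# `serving.pagerank` is importable when running from the repository root.
sys.path.append(os.path.abspath("search"))

from serving.pagerank import compute_pagerank

# Hypothetical toy graph: URL -> set of URLs it links to.
# "https://d.example" never appears as a key, so it is a dangling node.
toy_graph = {
    "https://a.example": {"https://b.example", "https://c.example"},
    "https://b.example": {"https://c.example"},
    "https://c.example": {"https://a.example", "https://d.example"},
}

scores = compute_pagerank(toy_graph, damping_factor=0.85)

# Print pages from highest to lowest PageRank.
for url, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{score:.6f}  {url}")
```

The dangling page receives rank but passes it back to every node through the `dangling_sum` term, which is why `compute_pagerank` collects `dangling_nodes` before iterating.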