Amazing-Python-Scripts
936 lines · 46.1 KB
1{
2"nbformat": 4,
3"nbformat_minor": 0,
4"metadata": {
5"kernelspec": {
6"display_name": "Python 3",
7"language": "python",
8"name": "python3"
9},
10"language_info": {
11"codemirror_mode": {
12"name": "ipython",
13"version": 3
14},
15"file_extension": ".py",
16"mimetype": "text/x-python",
17"name": "python",
18"nbconvert_exporter": "python",
19"pygments_lexer": "ipython3",
20"version": "3.8.3"
21},
22"colab": {
23"name": "WebScrapping and PreProcessing.ipynb",
24"provenance": [],
25"toc_visible": true,
26"include_colab_link": true
27}
28},
29"cells": [
30{
31"cell_type": "markdown",
32"metadata": {
33"id": "view-in-github",
34"colab_type": "text"
35},
36"source": [
37"<a href=\"https://colab.research.google.com/github/shubhigupta991/Reddit-Flair-Detection/blob/main/scripts/WebScrapping%20and%20PreProcessing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
38]
39},
40{
41"cell_type": "markdown",
42"metadata": {
43"id": "6dtwbrrXdGcx"
44},
45"source": [
46"# Collecting the data from reddit\n",
47"\n",
48"We are collecting the data with the help of PRAW\n",
49"\n",
50"PRAW stands for Python Reddit API Wrapper."
51]
52},
53{
54"cell_type": "code",
55"metadata": {
56"colab": {
57"base_uri": "https://localhost:8080/"
58},
59"id": "xjMXAytddIoJ",
60"outputId": "c21b125c-d1e9-48ac-ba70-d99dc818298d"
61},
62"source": [
63"!pip install praw"
64],
65"execution_count": 1,
66"outputs": [
67{
68"output_type": "stream",
69"text": [
70"Collecting praw\n",
71"\u001b[?25l Downloading https://files.pythonhosted.org/packages/2c/15/4bcc44271afce0316c73cd2ed35f951f1363a07d4d5d5440ae5eb2baad78/praw-7.1.0-py3-none-any.whl (152kB)\n",
72"\r\u001b[K |██▏ | 10kB 18.0MB/s eta 0:00:01\r\u001b[K |████▎ | 20kB 18.7MB/s eta 0:00:01\r\u001b[K |██████▌ | 30kB 15.1MB/s eta 0:00:01\r\u001b[K |████████▋ | 40kB 9.1MB/s eta 0:00:01\r\u001b[K |██████████▊ | 51kB 10.8MB/s eta 0:00:01\r\u001b[K |█████████████ | 61kB 12.3MB/s eta 0:00:01\r\u001b[K |███████████████ | 71kB 11.5MB/s eta 0:00:01\r\u001b[K |█████████████████▎ | 81kB 12.3MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 92kB 12.1MB/s eta 0:00:01\r\u001b[K |█████████████████████▌ | 102kB 10.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▊ | 112kB 10.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▉ | 122kB 10.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 133kB 10.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▏ | 143kB 10.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 153kB 10.9MB/s \n",
73"\u001b[?25hCollecting websocket-client>=0.54.0\n",
74"\u001b[?25l Downloading https://files.pythonhosted.org/packages/4c/5f/f61b420143ed1c8dc69f9eaec5ff1ac36109d52c80de49d66e0c36c3dfdf/websocket_client-0.57.0-py2.py3-none-any.whl (200kB)\n",
75"\u001b[K |████████████████████████████████| 204kB 19.9MB/s \n",
76"\u001b[?25hCollecting update-checker>=0.17\n",
77" Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl\n",
78"Collecting prawcore<2.0,>=1.3.0\n",
79" Downloading https://files.pythonhosted.org/packages/1d/40/b741437ce4c7b64f928513817b29c0a615efb66ab5e5e01f66fe92d2d95b/prawcore-1.5.0-py3-none-any.whl\n",
80"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from websocket-client>=0.54.0->praw) (1.15.0)\n",
81"Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.17->praw) (2.23.0)\n",
82"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (2.10)\n",
83"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (3.0.4)\n",
84"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (1.24.3)\n",
85"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (2020.11.8)\n",
86"Installing collected packages: websocket-client, update-checker, prawcore, praw\n",
87"Successfully installed praw-7.1.0 prawcore-1.5.0 update-checker-0.18.0 websocket-client-0.57.0\n"
88],
89"name": "stdout"
90}
91]
92},
93{
94"cell_type": "code",
95"metadata": {
96"colab": {
97"base_uri": "https://localhost:8080/"
98},
99"id": "dGSFaRr4PyEN",
100"outputId": "fa179cad-ea22-4ae8-c897-6832de9fd044"
101},
102"source": [
103"import praw\n",
104"import pandas as pd\n",
105"import numpy as np \n",
106"import re\n",
107"import nltk\n",
108"from nltk.corpus import stopwords\n",
109"import datetime as dt\n",
110"nltk.download('all')\n",
111"from bs4 import BeautifulSoup"
112],
113"execution_count": 2,
114"outputs": [
115{
116"output_type": "stream",
117"text": [
118"[nltk_data] Downloading collection 'all'\n",
119"[nltk_data] | \n",
120"[nltk_data] | Downloading package abc to /root/nltk_data...\n",
121"[nltk_data] | Unzipping corpora/abc.zip.\n",
122"[nltk_data] | Downloading package alpino to /root/nltk_data...\n",
123"[nltk_data] | Unzipping corpora/alpino.zip.\n",
124"[nltk_data] | Downloading package biocreative_ppi to\n",
125"[nltk_data] | /root/nltk_data...\n",
126"[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n",
127"[nltk_data] | Downloading package brown to /root/nltk_data...\n",
128"[nltk_data] | Unzipping corpora/brown.zip.\n",
129"[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n",
130"[nltk_data] | Unzipping corpora/brown_tei.zip.\n",
131"[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n",
132"[nltk_data] | Unzipping corpora/cess_cat.zip.\n",
133"[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n",
134"[nltk_data] | Unzipping corpora/cess_esp.zip.\n",
135"[nltk_data] | Downloading package chat80 to /root/nltk_data...\n",
136"[nltk_data] | Unzipping corpora/chat80.zip.\n",
137"[nltk_data] | Downloading package city_database to\n",
138"[nltk_data] | /root/nltk_data...\n",
139"[nltk_data] | Unzipping corpora/city_database.zip.\n",
140"[nltk_data] | Downloading package cmudict to /root/nltk_data...\n",
141"[nltk_data] | Unzipping corpora/cmudict.zip.\n",
142"[nltk_data] | Downloading package comparative_sentences to\n",
143"[nltk_data] | /root/nltk_data...\n",
144"[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n",
145"[nltk_data] | Downloading package comtrans to /root/nltk_data...\n",
146"[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n",
147"[nltk_data] | Unzipping corpora/conll2000.zip.\n",
148"[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n",
149"[nltk_data] | Unzipping corpora/conll2002.zip.\n",
150"[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n",
151"[nltk_data] | Downloading package crubadan to /root/nltk_data...\n",
152"[nltk_data] | Unzipping corpora/crubadan.zip.\n",
153"[nltk_data] | Downloading package dependency_treebank to\n",
154"[nltk_data] | /root/nltk_data...\n",
155"[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n",
156"[nltk_data] | Downloading package dolch to /root/nltk_data...\n",
157"[nltk_data] | Unzipping corpora/dolch.zip.\n",
158"[nltk_data] | Downloading package europarl_raw to\n",
159"[nltk_data] | /root/nltk_data...\n",
160"[nltk_data] | Unzipping corpora/europarl_raw.zip.\n",
161"[nltk_data] | Downloading package floresta to /root/nltk_data...\n",
162"[nltk_data] | Unzipping corpora/floresta.zip.\n",
163"[nltk_data] | Downloading package framenet_v15 to\n",
164"[nltk_data] | /root/nltk_data...\n",
165"[nltk_data] | Unzipping corpora/framenet_v15.zip.\n",
166"[nltk_data] | Downloading package framenet_v17 to\n",
167"[nltk_data] | /root/nltk_data...\n",
168"[nltk_data] | Unzipping corpora/framenet_v17.zip.\n",
169"[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n",
170"[nltk_data] | Unzipping corpora/gazetteers.zip.\n",
171"[nltk_data] | Downloading package genesis to /root/nltk_data...\n",
172"[nltk_data] | Unzipping corpora/genesis.zip.\n",
173"[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n",
174"[nltk_data] | Unzipping corpora/gutenberg.zip.\n",
175"[nltk_data] | Downloading package ieer to /root/nltk_data...\n",
176"[nltk_data] | Unzipping corpora/ieer.zip.\n",
177"[nltk_data] | Downloading package inaugural to /root/nltk_data...\n",
178"[nltk_data] | Unzipping corpora/inaugural.zip.\n",
179"[nltk_data] | Downloading package indian to /root/nltk_data...\n",
180"[nltk_data] | Unzipping corpora/indian.zip.\n",
181"[nltk_data] | Downloading package jeita to /root/nltk_data...\n",
182"[nltk_data] | Downloading package kimmo to /root/nltk_data...\n",
183"[nltk_data] | Unzipping corpora/kimmo.zip.\n",
184"[nltk_data] | Downloading package knbc to /root/nltk_data...\n",
185"[nltk_data] | Downloading package lin_thesaurus to\n",
186"[nltk_data] | /root/nltk_data...\n",
187"[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n",
188"[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n",
189"[nltk_data] | Unzipping corpora/mac_morpho.zip.\n",
190"[nltk_data] | Downloading package machado to /root/nltk_data...\n",
191"[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n",
192"[nltk_data] | Downloading package moses_sample to\n",
193"[nltk_data] | /root/nltk_data...\n",
194"[nltk_data] | Unzipping models/moses_sample.zip.\n",
195"[nltk_data] | Downloading package movie_reviews to\n",
196"[nltk_data] | /root/nltk_data...\n",
197"[nltk_data] | Unzipping corpora/movie_reviews.zip.\n",
198"[nltk_data] | Downloading package names to /root/nltk_data...\n",
199"[nltk_data] | Unzipping corpora/names.zip.\n",
200"[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n",
201"[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n",
202"[nltk_data] | Unzipping corpora/nps_chat.zip.\n",
203"[nltk_data] | Downloading package omw to /root/nltk_data...\n",
204"[nltk_data] | Unzipping corpora/omw.zip.\n",
205"[nltk_data] | Downloading package opinion_lexicon to\n",
206"[nltk_data] | /root/nltk_data...\n",
207"[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n",
208"[nltk_data] | Downloading package paradigms to /root/nltk_data...\n",
209"[nltk_data] | Unzipping corpora/paradigms.zip.\n",
210"[nltk_data] | Downloading package pil to /root/nltk_data...\n",
211"[nltk_data] | Unzipping corpora/pil.zip.\n",
212"[nltk_data] | Downloading package pl196x to /root/nltk_data...\n",
213"[nltk_data] | Unzipping corpora/pl196x.zip.\n",
214"[nltk_data] | Downloading package ppattach to /root/nltk_data...\n",
215"[nltk_data] | Unzipping corpora/ppattach.zip.\n",
216"[nltk_data] | Downloading package problem_reports to\n",
217"[nltk_data] | /root/nltk_data...\n",
218"[nltk_data] | Unzipping corpora/problem_reports.zip.\n",
219"[nltk_data] | Downloading package propbank to /root/nltk_data...\n",
220"[nltk_data] | Downloading package ptb to /root/nltk_data...\n",
221"[nltk_data] | Unzipping corpora/ptb.zip.\n",
222"[nltk_data] | Downloading package product_reviews_1 to\n",
223"[nltk_data] | /root/nltk_data...\n",
224"[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n",
225"[nltk_data] | Downloading package product_reviews_2 to\n",
226"[nltk_data] | /root/nltk_data...\n",
227"[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n",
228"[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n",
229"[nltk_data] | Unzipping corpora/pros_cons.zip.\n",
230"[nltk_data] | Downloading package qc to /root/nltk_data...\n",
231"[nltk_data] | Unzipping corpora/qc.zip.\n",
232"[nltk_data] | Downloading package reuters to /root/nltk_data...\n",
233"[nltk_data] | Downloading package rte to /root/nltk_data...\n",
234"[nltk_data] | Unzipping corpora/rte.zip.\n",
235"[nltk_data] | Downloading package semcor to /root/nltk_data...\n",
236"[nltk_data] | Downloading package senseval to /root/nltk_data...\n",
237"[nltk_data] | Unzipping corpora/senseval.zip.\n",
238"[nltk_data] | Downloading package sentiwordnet to\n",
239"[nltk_data] | /root/nltk_data...\n",
240"[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n",
241"[nltk_data] | Downloading package sentence_polarity to\n",
242"[nltk_data] | /root/nltk_data...\n",
243"[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n",
244"[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n",
245"[nltk_data] | Unzipping corpora/shakespeare.zip.\n",
246"[nltk_data] | Downloading package sinica_treebank to\n",
247"[nltk_data] | /root/nltk_data...\n",
248"[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n",
249"[nltk_data] | Downloading package smultron to /root/nltk_data...\n",
250"[nltk_data] | Unzipping corpora/smultron.zip.\n",
251"[nltk_data] | Downloading package state_union to /root/nltk_data...\n",
252"[nltk_data] | Unzipping corpora/state_union.zip.\n",
253"[nltk_data] | Downloading package stopwords to /root/nltk_data...\n",
254"[nltk_data] | Unzipping corpora/stopwords.zip.\n",
255"[nltk_data] | Downloading package subjectivity to\n",
256"[nltk_data] | /root/nltk_data...\n",
257"[nltk_data] | Unzipping corpora/subjectivity.zip.\n",
258"[nltk_data] | Downloading package swadesh to /root/nltk_data...\n",
259"[nltk_data] | Unzipping corpora/swadesh.zip.\n",
260"[nltk_data] | Downloading package switchboard to /root/nltk_data...\n",
261"[nltk_data] | Unzipping corpora/switchboard.zip.\n",
262"[nltk_data] | Downloading package timit to /root/nltk_data...\n",
263"[nltk_data] | Unzipping corpora/timit.zip.\n",
264"[nltk_data] | Downloading package toolbox to /root/nltk_data...\n",
265"[nltk_data] | Unzipping corpora/toolbox.zip.\n",
266"[nltk_data] | Downloading package treebank to /root/nltk_data...\n",
267"[nltk_data] | Unzipping corpora/treebank.zip.\n",
268"[nltk_data] | Downloading package twitter_samples to\n",
269"[nltk_data] | /root/nltk_data...\n",
270"[nltk_data] | Unzipping corpora/twitter_samples.zip.\n",
271"[nltk_data] | Downloading package udhr to /root/nltk_data...\n",
272"[nltk_data] | Unzipping corpora/udhr.zip.\n",
273"[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n",
274"[nltk_data] | Unzipping corpora/udhr2.zip.\n",
275"[nltk_data] | Downloading package unicode_samples to\n",
276"[nltk_data] | /root/nltk_data...\n",
277"[nltk_data] | Unzipping corpora/unicode_samples.zip.\n",
278"[nltk_data] | Downloading package universal_treebanks_v20 to\n",
279"[nltk_data] | /root/nltk_data...\n",
280"[nltk_data] | Downloading package verbnet to /root/nltk_data...\n",
281"[nltk_data] | Unzipping corpora/verbnet.zip.\n",
282"[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n",
283"[nltk_data] | Unzipping corpora/verbnet3.zip.\n",
284"[nltk_data] | Downloading package webtext to /root/nltk_data...\n",
285"[nltk_data] | Unzipping corpora/webtext.zip.\n",
286"[nltk_data] | Downloading package wordnet to /root/nltk_data...\n",
287"[nltk_data] | Unzipping corpora/wordnet.zip.\n",
288"[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n",
289"[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n",
290"[nltk_data] | Downloading package words to /root/nltk_data...\n",
291"[nltk_data] | Unzipping corpora/words.zip.\n",
292"[nltk_data] | Downloading package ycoe to /root/nltk_data...\n",
293"[nltk_data] | Unzipping corpora/ycoe.zip.\n",
294"[nltk_data] | Downloading package rslp to /root/nltk_data...\n",
295"[nltk_data] | Unzipping stemmers/rslp.zip.\n",
296"[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n",
297"[nltk_data] | /root/nltk_data...\n",
298"[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n",
299"[nltk_data] | Downloading package universal_tagset to\n",
300"[nltk_data] | /root/nltk_data...\n",
301"[nltk_data] | Unzipping taggers/universal_tagset.zip.\n",
302"[nltk_data] | Downloading package maxent_ne_chunker to\n",
303"[nltk_data] | /root/nltk_data...\n",
304"[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n",
305"[nltk_data] | Downloading package punkt to /root/nltk_data...\n",
306"[nltk_data] | Unzipping tokenizers/punkt.zip.\n",
307"[nltk_data] | Downloading package book_grammars to\n",
308"[nltk_data] | /root/nltk_data...\n",
309"[nltk_data] | Unzipping grammars/book_grammars.zip.\n",
310"[nltk_data] | Downloading package sample_grammars to\n",
311"[nltk_data] | /root/nltk_data...\n",
312"[nltk_data] | Unzipping grammars/sample_grammars.zip.\n",
313"[nltk_data] | Downloading package spanish_grammars to\n",
314"[nltk_data] | /root/nltk_data...\n",
315"[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n",
316"[nltk_data] | Downloading package basque_grammars to\n",
317"[nltk_data] | /root/nltk_data...\n",
318"[nltk_data] | Unzipping grammars/basque_grammars.zip.\n",
319"[nltk_data] | Downloading package large_grammars to\n",
320"[nltk_data] | /root/nltk_data...\n",
321"[nltk_data] | Unzipping grammars/large_grammars.zip.\n",
322"[nltk_data] | Downloading package tagsets to /root/nltk_data...\n",
323"[nltk_data] | Unzipping help/tagsets.zip.\n",
324"[nltk_data] | Downloading package snowball_data to\n",
325"[nltk_data] | /root/nltk_data...\n",
326"[nltk_data] | Downloading package bllip_wsj_no_aux to\n",
327"[nltk_data] | /root/nltk_data...\n",
328"[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n",
329"[nltk_data] | Downloading package word2vec_sample to\n",
330"[nltk_data] | /root/nltk_data...\n",
331"[nltk_data] | Unzipping models/word2vec_sample.zip.\n",
332"[nltk_data] | Downloading package panlex_swadesh to\n",
333"[nltk_data] | /root/nltk_data...\n",
334"[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n",
335"[nltk_data] | Unzipping corpora/mte_teip5.zip.\n",
336"[nltk_data] | Downloading package averaged_perceptron_tagger to\n",
337"[nltk_data] | /root/nltk_data...\n",
338"[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n",
339"[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n",
340"[nltk_data] | /root/nltk_data...\n",
341"[nltk_data] | Unzipping\n",
342"[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n",
343"[nltk_data] | Downloading package perluniprops to\n",
344"[nltk_data] | /root/nltk_data...\n",
345"[nltk_data] | Unzipping misc/perluniprops.zip.\n",
346"[nltk_data] | Downloading package nonbreaking_prefixes to\n",
347"[nltk_data] | /root/nltk_data...\n",
348"[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n",
349"[nltk_data] | Downloading package vader_lexicon to\n",
350"[nltk_data] | /root/nltk_data...\n",
351"[nltk_data] | Downloading package porter_test to /root/nltk_data...\n",
352"[nltk_data] | Unzipping stemmers/porter_test.zip.\n",
353"[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n",
354"[nltk_data] | Unzipping models/wmt15_eval.zip.\n",
355"[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n",
356"[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n",
357"[nltk_data] | \n",
358"[nltk_data] Done downloading collection all\n"
359],
360"name": "stdout"
361}
362]
363},
364{
365"cell_type": "code",
366"metadata": {
367"id": "jX1CxFXWQELt"
368},
369"source": [
370"import os\n",
371"\n",
372"# SECURITY: API credentials were previously hardcoded here (and are therefore\n",
373"# leaked in this notebook's history — rotate them in the Reddit app settings).\n",
374"# Read them from environment variables instead of embedding them in source.\n",
375"reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],\n",
376"                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],\n",
377"                     user_agent='Flair-Detector',\n",
378"                     username=os.environ['REDDIT_USERNAME'],\n",
379"                     password=os.environ['REDDIT_PASSWORD'])"
372],
373"execution_count": 3,
374"outputs": []
375},
376{
377"cell_type": "code",
378"metadata": {
379"id": "AntEsya7QcRv"
380},
381"source": [
382"subreddit = reddit.subreddit('india')"
383],
384"execution_count": 4,
385"outputs": []
386},
387{
388"cell_type": "code",
389"metadata": {
390"id": "cba--ygKQlF9"
391},
392"source": [
393"labels = {\"title\":[], \"id\":[], \"score\":[], \"url\":[], \"created\": [], \"num_of_comments\": [], \"body\":[], \"author\":[], \"comments\":[], \"flair\":[]}\n",
394"flairs = [\"AskIndia\", \"Non-Political\", \"[R]eddiquette\", \"Scheduled\", \"Photography\", \"Science/Technology\", \"Politics\", \n",
395" \"Business/Finance\", \"Policy/Economy\", \"Sports\", \"Food\", \"AMA\",\"Coronavirus\"]"
396],
397"execution_count": 5,
398"outputs": []
399},
400{
401"cell_type": "code",
402"metadata": {
403"id": "3FWTHEwwRNbj"
404},
405"source": [
406"for flair in flairs:\n",
407" \n",
408" get_subreddits = subreddit.search(flair, limit=100)\n",
409" \n",
410" for each_post in get_subreddits:\n",
411" \n",
412" labels[\"flair\"].append(flair)\n",
413" labels[\"title\"].append(each_post.title)\n",
414" labels[\"score\"].append(each_post.score)\n",
415" labels[\"id\"].append(each_post.id)\n",
416" labels[\"url\"].append(each_post.url)\n",
417" labels[\"num_of_comments\"].append(each_post.num_comments)\n",
418" labels[\"created\"].append(each_post.created)\n",
419" labels[\"body\"].append(each_post.selftext)\n",
420" labels[\"author\"].append(each_post.author)\n",
421" \n",
422" each_post.comments.replace_more(limit=None)\n",
423" comment = ''\n",
424" for top_level_comment in each_post.comments:\n",
425" comment = comment + ' ' + top_level_comment.body\n",
426" labels[\"comments\"].append(comment)"
427],
428"execution_count": 6,
429"outputs": []
430},
431{
432"cell_type": "code",
433"metadata": {
434"id": "m46lo67MR1QQ"
435},
436"source": [
437"def get_date(created):\n",
438" return dt.datetime.fromtimestamp(created)"
439],
440"execution_count": 7,
441"outputs": []
442},
443{
444"cell_type": "code",
445"metadata": {
446"id": "miLXoiW3TnoJ"
447},
448"source": [
449"data = pd.DataFrame(labels)\n",
450"time =data[\"created\"].apply(get_date)\n",
451"data =data.assign(timestamp = time)\n",
452"del data['created']\n",
453"data.to_csv('reddit-india-data.csv', index=False)"
454],
455"execution_count": 9,
456"outputs": []
457},
458{
459"cell_type": "code",
460"metadata": {
461"colab": {
462"base_uri": "https://localhost:8080/",
463"height": 700
464},
465"id": "JIDYNrXqTrwP",
466"outputId": "a93a1db9-0bbd-42b7-e495-2001e15a722a"
467},
468"source": [
469"data=pd.read_csv('reddit-india-data.csv')\n",
470"data.head()"
471],
472"execution_count": 10,
473"outputs": [
474{
475"output_type": "execute_result",
476"data": {
477"text/html": [
478"<div>\n",
479"<style scoped>\n",
480" .dataframe tbody tr th:only-of-type {\n",
481" vertical-align: middle;\n",
482" }\n",
483"\n",
484" .dataframe tbody tr th {\n",
485" vertical-align: top;\n",
486" }\n",
487"\n",
488" .dataframe thead th {\n",
489" text-align: right;\n",
490" }\n",
491"</style>\n",
492"<table border=\"1\" class=\"dataframe\">\n",
493" <thead>\n",
494" <tr style=\"text-align: right;\">\n",
495" <th></th>\n",
496" <th>title</th>\n",
497" <th>id</th>\n",
498" <th>score</th>\n",
499" <th>url</th>\n",
500" <th>num_of_comments</th>\n",
501" <th>body</th>\n",
502" <th>author</th>\n",
503" <th>comments</th>\n",
504" <th>flair</th>\n",
505" <th>timestamp</th>\n",
506" </tr>\n",
507" </thead>\n",
508" <tbody>\n",
509" <tr>\n",
510" <th>0</th>\n",
511" <td>How to approach a girl?</td>\n",
512" <td>k0qt2r</td>\n",
513" <td>3</td>\n",
514" <td>https://www.reddit.com/r/india/comments/k0qt2r...</td>\n",
515" <td>15</td>\n",
516" <td>2 years back I was working in a startup compan...</td>\n",
517" <td>covidmanbun</td>\n",
518" <td>Stop watching Indian movies. \\nStop stalking...</td>\n",
519" <td>AskIndia</td>\n",
520" <td>2020-11-25 19:50:12</td>\n",
521" </tr>\n",
522" <tr>\n",
523" <th>1</th>\n",
524" <td>Where is gelatine available ?</td>\n",
525" <td>jk9zlt</td>\n",
526" <td>0</td>\n",
527" <td>https://www.reddit.com/r/india/comments/jk9zlt...</td>\n",
528" <td>4</td>\n",
529" <td>I wish to buy gelatine and am looking for the ...</td>\n",
530" <td>csstudentG</td>\n",
531" <td>It's available in your regular kirana stores....</td>\n",
532" <td>AskIndia</td>\n",
533" <td>2020-10-29 21:26:38</td>\n",
534" </tr>\n",
535" <tr>\n",
536" <th>2</th>\n",
537" <td>Trevor Noah's jokes during Indo-Pak tensions a...</td>\n",
538" <td>gv9lmh</td>\n",
539" <td>38</td>\n",
540" <td>https://www.reddit.com/r/india/comments/gv9lmh...</td>\n",
541" <td>29</td>\n",
542" <td>I don't really watch much of Trevor Noah's lat...</td>\n",
543" <td>CommYouNitty</td>\n",
544" <td>It's mostly because we as a country don't rea...</td>\n",
545" <td>AskIndia</td>\n",
546" <td>2020-06-02 23:42:24</td>\n",
547" </tr>\n",
548" <tr>\n",
549" <th>3</th>\n",
550" <td>Need feedback for Insurance Policy that I took...</td>\n",
551" <td>1s57oi</td>\n",
552" <td>1</td>\n",
553" <td>https://www.reddit.com/r/india/comments/1s57oi...</td>\n",
554" <td>1</td>\n",
555" <td>**Re-posting here because of lack of activity ...</td>\n",
556" <td>dhavalcoholic</td>\n",
557" <td>Dear Policy Holder(Dhavalcoholic),\\n \\nWe req...</td>\n",
558" <td>AskIndia</td>\n",
559" <td>2013-12-05 14:30:23</td>\n",
560" </tr>\n",
561" <tr>\n",
562" <th>4</th>\n",
563" <td>Buying used BS4 Scooty after April 1, Is there...</td>\n",
564" <td>hpqq5o</td>\n",
565" <td>12</td>\n",
566" <td>https://www.reddit.com/r/india/comments/hpqq5o...</td>\n",
567" <td>7</td>\n",
568" <td>I don't know if it's right place to ask, but \"...</td>\n",
569" <td>akza07</td>\n",
570" <td>Scammers will keep scamming. \\n\\nIt applies t...</td>\n",
571" <td>AskIndia</td>\n",
572" <td>2020-07-12 15:58:15</td>\n",
573" </tr>\n",
574" </tbody>\n",
575"</table>\n",
576"</div>"
577],
578"text/plain": [
579" title ... timestamp\n",
580"0 How to approach a girl? ... 2020-11-25 19:50:12\n",
581"1 Where is gelatine available ? ... 2020-10-29 21:26:38\n",
582"2 Trevor Noah's jokes during Indo-Pak tensions a... ... 2020-06-02 23:42:24\n",
583"3 Need feedback for Insurance Policy that I took... ... 2013-12-05 14:30:23\n",
584"4 Buying used BS4 Scooty after April 1, Is there... ... 2020-07-12 15:58:15\n",
585"\n",
586"[5 rows x 10 columns]"
587]
588},
589"metadata": {
590"tags": []
591},
592"execution_count": 10
593}
594]
595},
596{
597"cell_type": "code",
598"metadata": {
599"id": "POhV5u6UTvAg"
600},
601"source": [
602"def string(value):\n",
603" return str(value)"
604],
605"execution_count": 11,
606"outputs": []
607},
608{
609"cell_type": "code",
610"metadata": {
611"id": "vn4gCMiCT0j_"
612},
613"source": [
614"# BUG FIX: the previous code called string(data['title']) — i.e. str() on the\n",
615"# whole Series — which yields a single repr string of the entire column that\n",
616"# pandas then broadcasts into EVERY row (see the corrupted, identical\n",
617"# '0 approach girl1 gelatine available...' values in the later cell outputs).\n",
618"# Convert element-wise instead so each row keeps its own text.\n",
619"data['title'] = data['title'].astype(str)\n",
620"data['body'] = data['body'].astype(str)\n",
621"data['comments'] = data['comments'].astype(str)"
617],
618"execution_count": 12,
619"outputs": []
620},
621{
622"cell_type": "code",
623"metadata": {
624"id": "Tw1hectOT1sR"
625},
626"source": [
627"replace_by_space = re.compile('[/(){}\\[\\]\\|@,;]')\n",
628"bad_symbols = re.compile('[^0-9a-z #+_]')\n",
629"stopWords = set(stopwords.words('english'))\n",
630"def text_cleaning(text):\n",
631" \n",
632" text = BeautifulSoup(text, \"lxml\").text\n",
633" text = text.lower()\n",
634" text = replace_by_space.sub(' ', text)\n",
635" text = bad_symbols.sub('', text)\n",
636" text = ' '.join(word for word in text.split() if word not in stopWords)\n",
637" return text"
638],
639"execution_count": 13,
640"outputs": []
641},
642{
643"cell_type": "code",
644"metadata": {
645"id": "dqCNZYjXT4PM"
646},
647"source": [
648"data['title'] = data['title'].apply(text_cleaning)\n",
649"data['body'] = data['body'].apply(text_cleaning)\n",
650"data['comments'] = data['comments'].apply(text_cleaning)"
651],
652"execution_count": 16,
653"outputs": []
654},
655{
656"cell_type": "code",
657"metadata": {
658"id": "p9avmBZKUBQG"
659},
660"source": [
661"combined_features = data[\"title\"] + data[\"comments\"] + data[\"url\"] + data[\"body\"]\n",
662"data = data.assign(combined_features = combined_features)"
663],
664"execution_count": 17,
665"outputs": []
666},
667{
668"cell_type": "code",
669"metadata": {
670"id": "tuKJ0XluUDVI"
671},
672"source": [
673"# index=False prevents the spurious 'Unnamed: 0' column that appears when the\n",
674"# CSV is re-read (visible in the next cell's output of the original run).\n",
675"data.to_csv('data.csv', index=False)"
674],
675"execution_count": 18,
676"outputs": []
677},
678{
679"cell_type": "code",
680"metadata": {
681"colab": {
682"base_uri": "https://localhost:8080/",
683"height": 1000
684},
685"id": "1GuyCluSUGKr",
686"outputId": "9f93bb24-19f5-4d10-8131-3c7594a694cc"
687},
688"source": [
689"pd.read_csv('data.csv')"
690],
691"execution_count": 19,
692"outputs": [
693{
694"output_type": "execute_result",
695"data": {
696"text/html": [
697"<div>\n",
698"<style scoped>\n",
699" .dataframe tbody tr th:only-of-type {\n",
700" vertical-align: middle;\n",
701" }\n",
702"\n",
703" .dataframe tbody tr th {\n",
704" vertical-align: top;\n",
705" }\n",
706"\n",
707" .dataframe thead th {\n",
708" text-align: right;\n",
709" }\n",
710"</style>\n",
711"<table border=\"1\" class=\"dataframe\">\n",
712" <thead>\n",
713" <tr style=\"text-align: right;\">\n",
714" <th></th>\n",
715" <th>Unnamed: 0</th>\n",
716" <th>title</th>\n",
717" <th>id</th>\n",
718" <th>score</th>\n",
719" <th>url</th>\n",
720" <th>num_of_comments</th>\n",
721" <th>body</th>\n",
722" <th>author</th>\n",
723" <th>comments</th>\n",
724" <th>flair</th>\n",
725" <th>timestamp</th>\n",
726" <th>combined_features</th>\n",
727" </tr>\n",
728" </thead>\n",
729" <tbody>\n",
730" <tr>\n",
731" <th>0</th>\n",
732" <td>0</td>\n",
733" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
734" <td>k0qt2r</td>\n",
735" <td>3</td>\n",
736" <td>https://www.reddit.com/r/india/comments/k0qt2r...</td>\n",
737" <td>15</td>\n",
738" <td>0 2 years back working startup compan1 wish bu...</td>\n",
739" <td>covidmanbun</td>\n",
740" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
741" <td>AskIndia</td>\n",
742" <td>2020-11-25 19:50:12</td>\n",
743" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
744" </tr>\n",
745" <tr>\n",
746" <th>1</th>\n",
747" <td>1</td>\n",
748" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
749" <td>jk9zlt</td>\n",
750" <td>0</td>\n",
751" <td>https://www.reddit.com/r/india/comments/jk9zlt...</td>\n",
752" <td>4</td>\n",
753" <td>0 2 years back working startup compan1 wish bu...</td>\n",
754" <td>csstudentG</td>\n",
755" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
756" <td>AskIndia</td>\n",
757" <td>2020-10-29 21:26:38</td>\n",
758" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
759" </tr>\n",
760" <tr>\n",
761" <th>2</th>\n",
762" <td>2</td>\n",
763" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
764" <td>gv9lmh</td>\n",
765" <td>38</td>\n",
766" <td>https://www.reddit.com/r/india/comments/gv9lmh...</td>\n",
767" <td>29</td>\n",
768" <td>0 2 years back working startup compan1 wish bu...</td>\n",
769" <td>CommYouNitty</td>\n",
770" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
771" <td>AskIndia</td>\n",
772" <td>2020-06-02 23:42:24</td>\n",
773" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
774" </tr>\n",
775" <tr>\n",
776" <th>3</th>\n",
777" <td>3</td>\n",
778" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
779" <td>1s57oi</td>\n",
780" <td>1</td>\n",
781" <td>https://www.reddit.com/r/india/comments/1s57oi...</td>\n",
782" <td>1</td>\n",
783" <td>0 2 years back working startup compan1 wish bu...</td>\n",
784" <td>dhavalcoholic</td>\n",
785" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
786" <td>AskIndia</td>\n",
787" <td>2013-12-05 14:30:23</td>\n",
788" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
789" </tr>\n",
790" <tr>\n",
791" <th>4</th>\n",
792" <td>4</td>\n",
793" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
794" <td>hpqq5o</td>\n",
795" <td>12</td>\n",
796" <td>https://www.reddit.com/r/india/comments/hpqq5o...</td>\n",
797" <td>7</td>\n",
798" <td>0 2 years back working startup compan1 wish bu...</td>\n",
799" <td>akza07</td>\n",
800" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
801" <td>AskIndia</td>\n",
802" <td>2020-07-12 15:58:15</td>\n",
803" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
804" </tr>\n",
805" <tr>\n",
806" <th>...</th>\n",
807" <td>...</td>\n",
808" <td>...</td>\n",
809" <td>...</td>\n",
810" <td>...</td>\n",
811" <td>...</td>\n",
812" <td>...</td>\n",
813" <td>...</td>\n",
814" <td>...</td>\n",
815" <td>...</td>\n",
816" <td>...</td>\n",
817" <td>...</td>\n",
818" <td>...</td>\n",
819" </tr>\n",
820" <tr>\n",
821" <th>1211</th>\n",
822" <td>1211</td>\n",
823" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
824" <td>j002jl</td>\n",
825" <td>323</td>\n",
826" <td>https://thewire.in/politics/bjp-bihar-election...</td>\n",
827" <td>43</td>\n",
828" <td>0 2 years back working startup compan1 wish bu...</td>\n",
829" <td>mubukugrappa</td>\n",
830" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
831" <td>Coronavirus</td>\n",
832" <td>2020-09-26 13:23:26</td>\n",
833" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
834" </tr>\n",
835" <tr>\n",
836" <th>1212</th>\n",
837" <td>1212</td>\n",
838" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
839" <td>j0wucw</td>\n",
840" <td>0</td>\n",
841" <td>https://www.reddit.com/r/india/comments/j0wucw...</td>\n",
842" <td>0</td>\n",
843" <td>0 2 years back working startup compan1 wish bu...</td>\n",
844" <td>pizzapuff93</td>\n",
845" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
846" <td>Coronavirus</td>\n",
847" <td>2020-09-28 03:08:39</td>\n",
848" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
849" </tr>\n",
850" <tr>\n",
851" <th>1213</th>\n",
852" <td>1213</td>\n",
853" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
854" <td>izx7aw</td>\n",
855" <td>5</td>\n",
856" <td>https://www.reddit.com/r/india/comments/izx7aw...</td>\n",
857" <td>1</td>\n",
858" <td>0 2 years back working startup compan1 wish bu...</td>\n",
859" <td>spddgr8</td>\n",
860" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
861" <td>Coronavirus</td>\n",
862" <td>2020-09-26 10:12:50</td>\n",
863" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
864" </tr>\n",
865" <tr>\n",
866" <th>1214</th>\n",
867" <td>1214</td>\n",
868" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
869" <td>ix6pl0</td>\n",
870" <td>1020</td>\n",
871" <td>https://www.reddit.com/gallery/ix6pl0</td>\n",
872" <td>153</td>\n",
873" <td>0 2 years back working startup compan1 wish bu...</td>\n",
874" <td>IndianPuppy</td>\n",
875" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
876" <td>Coronavirus</td>\n",
877" <td>2020-09-22 02:55:01</td>\n",
878" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
879" </tr>\n",
880" <tr>\n",
881" <th>1215</th>\n",
882" <td>1215</td>\n",
883" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
884" <td>iwafpe</td>\n",
885" <td>289</td>\n",
886" <td>https://theconversation.com/india-why-secrecy-...</td>\n",
887" <td>9</td>\n",
888" <td>0 2 years back working startup compan1 wish bu...</td>\n",
889" <td>9kSs</td>\n",
890" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
891" <td>Coronavirus</td>\n",
892" <td>2020-09-20 16:56:43</td>\n",
893" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
894" </tr>\n",
895" </tbody>\n",
896"</table>\n",
897"<p>1216 rows × 12 columns</p>\n",
898"</div>"
899],
900"text/plain": [
901" Unnamed: 0 ... combined_features\n",
902"0 0 ... 0 approach girl1 gelatine available 2 trevor n...\n",
903"1 1 ... 0 approach girl1 gelatine available 2 trevor n...\n",
904"2 2 ... 0 approach girl1 gelatine available 2 trevor n...\n",
905"3 3 ... 0 approach girl1 gelatine available 2 trevor n...\n",
906"4 4 ... 0 approach girl1 gelatine available 2 trevor n...\n",
907"... ... ... ...\n",
908"1211 1211 ... 0 approach girl1 gelatine available 2 trevor n...\n",
909"1212 1212 ... 0 approach girl1 gelatine available 2 trevor n...\n",
910"1213 1213 ... 0 approach girl1 gelatine available 2 trevor n...\n",
911"1214 1214 ... 0 approach girl1 gelatine available 2 trevor n...\n",
912"1215 1215 ... 0 approach girl1 gelatine available 2 trevor n...\n",
913"\n",
914"[1216 rows x 12 columns]"
915]
916},
917"metadata": {
918"tags": []
919},
920"execution_count": 19
921}
922]
923},
924{
925"cell_type": "code",
926"metadata": {
927"id": "9hUqQIICwtIt"
928},
929"source": [
930""
931],
932"execution_count": null,
933"outputs": []
934}
935]
936}