Amazing-Python-Scripts
936 lines · 46.1 KB
1{
2"nbformat": 4,
3"nbformat_minor": 0,
4"metadata": {
5"kernelspec": {
6"display_name": "Python 3",
7"language": "python",
8"name": "python3"
9},
10"language_info": {
11"codemirror_mode": {
12"name": "ipython",
13"version": 3
14},
15"file_extension": ".py",
16"mimetype": "text/x-python",
17"name": "python",
18"nbconvert_exporter": "python",
19"pygments_lexer": "ipython3",
20"version": "3.8.3"
21},
22"colab": {
23"name": "WebScrapping and PreProcessing.ipynb",
24"provenance": [],
25"toc_visible": true,
26"include_colab_link": true
27}
28},
29"cells": [
30{
31"cell_type": "markdown",
32"metadata": {
33"id": "view-in-github",
34"colab_type": "text"
35},
36"source": [
37"<a href=\"https://colab.research.google.com/github/shubhigupta991/Reddit-Flair-Detection/blob/main/scripts/WebScrapping%20and%20PreProcessing.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
38]
39},
40{
41"cell_type": "markdown",
42"metadata": {
43"id": "6dtwbrrXdGcx"
44},
45"source": [
46"# Collecting the data from reddit\n",
47"\n",
48"We are collecting the data with the help of PRAW\n",
49"\n",
50"PRAW stands for Python Reddit API Wrapper."
51]
52},
53{
54"cell_type": "code",
55"metadata": {
56"colab": {
57"base_uri": "https://localhost:8080/"
58},
59"id": "xjMXAytddIoJ",
60"outputId": "c21b125c-d1e9-48ac-ba70-d99dc818298d"
61},
62"source": [
63"!pip install praw"
64],
65"execution_count": 1,
66"outputs": [
67{
68"output_type": "stream",
69"text": [
70"Collecting praw\n",
71"\u001b[?25l Downloading https://files.pythonhosted.org/packages/2c/15/4bcc44271afce0316c73cd2ed35f951f1363a07d4d5d5440ae5eb2baad78/praw-7.1.0-py3-none-any.whl (152kB)\n",
72"\r\u001b[K |██▏ | 10kB 18.0MB/s eta 0:00:01\r\u001b[K |████▎ | 20kB 18.7MB/s eta 0:00:01\r\u001b[K |██████▌ | 30kB 15.1MB/s eta 0:00:01\r\u001b[K |████████▋ | 40kB 9.1MB/s eta 0:00:01\r\u001b[K |██████████▊ | 51kB 10.8MB/s eta 0:00:01\r\u001b[K |█████████████ | 61kB 12.3MB/s eta 0:00:01\r\u001b[K |███████████████ | 71kB 11.5MB/s eta 0:00:01\r\u001b[K |█████████████████▎ | 81kB 12.3MB/s eta 0:00:01\r\u001b[K |███████████████████▍ | 92kB 12.1MB/s eta 0:00:01\r\u001b[K |█████████████████████▌ | 102kB 10.9MB/s eta 0:00:01\r\u001b[K |███████████████████████▊ | 112kB 10.9MB/s eta 0:00:01\r\u001b[K |█████████████████████████▉ | 122kB 10.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 133kB 10.9MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▏ | 143kB 10.9MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 153kB 10.9MB/s \n",
73"\u001b[?25hCollecting websocket-client>=0.54.0\n",
74"\u001b[?25l Downloading https://files.pythonhosted.org/packages/4c/5f/f61b420143ed1c8dc69f9eaec5ff1ac36109d52c80de49d66e0c36c3dfdf/websocket_client-0.57.0-py2.py3-none-any.whl (200kB)\n",
75"\u001b[K |████████████████████████████████| 204kB 19.9MB/s \n",
76"\u001b[?25hCollecting update-checker>=0.17\n",
77" Downloading https://files.pythonhosted.org/packages/0c/ba/8dd7fa5f0b1c6a8ac62f8f57f7e794160c1f86f31c6d0fb00f582372a3e4/update_checker-0.18.0-py3-none-any.whl\n",
78"Collecting prawcore<2.0,>=1.3.0\n",
79" Downloading https://files.pythonhosted.org/packages/1d/40/b741437ce4c7b64f928513817b29c0a615efb66ab5e5e01f66fe92d2d95b/prawcore-1.5.0-py3-none-any.whl\n",
80"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from websocket-client>=0.54.0->praw) (1.15.0)\n",
81"Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.17->praw) (2.23.0)\n",
82"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (2.10)\n",
83"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (3.0.4)\n",
84"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (1.24.3)\n",
85"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.17->praw) (2020.11.8)\n",
86"Installing collected packages: websocket-client, update-checker, prawcore, praw\n",
87"Successfully installed praw-7.1.0 prawcore-1.5.0 update-checker-0.18.0 websocket-client-0.57.0\n"
88],
89"name": "stdout"
90}
91]
92},
93{
94"cell_type": "code",
95"metadata": {
96"colab": {
97"base_uri": "https://localhost:8080/"
98},
99"id": "dGSFaRr4PyEN",
100"outputId": "fa179cad-ea22-4ae8-c897-6832de9fd044"
101},
102"source": [
103"import praw\n",
104"import pandas as pd\n",
105"import numpy as np \n",
106"import re\n",
107"import nltk\n",
108"from nltk.corpus import stopwords\n",
109"import datetime as dt\n",
110"nltk.download('all')\n",
111"from bs4 import BeautifulSoup"
112],
113"execution_count": 2,
114"outputs": [
115{
116"output_type": "stream",
117"text": [
118"[nltk_data] Downloading collection 'all'\n",
119"[nltk_data] | \n",
120"[nltk_data] | Downloading package abc to /root/nltk_data...\n",
121"[nltk_data] | Unzipping corpora/abc.zip.\n",
122"[nltk_data] | Downloading package alpino to /root/nltk_data...\n",
123"[nltk_data] | Unzipping corpora/alpino.zip.\n",
124"[nltk_data] | Downloading package biocreative_ppi to\n",
125"[nltk_data] | /root/nltk_data...\n",
126"[nltk_data] | Unzipping corpora/biocreative_ppi.zip.\n",
127"[nltk_data] | Downloading package brown to /root/nltk_data...\n",
128"[nltk_data] | Unzipping corpora/brown.zip.\n",
129"[nltk_data] | Downloading package brown_tei to /root/nltk_data...\n",
130"[nltk_data] | Unzipping corpora/brown_tei.zip.\n",
131"[nltk_data] | Downloading package cess_cat to /root/nltk_data...\n",
132"[nltk_data] | Unzipping corpora/cess_cat.zip.\n",
133"[nltk_data] | Downloading package cess_esp to /root/nltk_data...\n",
134"[nltk_data] | Unzipping corpora/cess_esp.zip.\n",
135"[nltk_data] | Downloading package chat80 to /root/nltk_data...\n",
136"[nltk_data] | Unzipping corpora/chat80.zip.\n",
137"[nltk_data] | Downloading package city_database to\n",
138"[nltk_data] | /root/nltk_data...\n",
139"[nltk_data] | Unzipping corpora/city_database.zip.\n",
140"[nltk_data] | Downloading package cmudict to /root/nltk_data...\n",
141"[nltk_data] | Unzipping corpora/cmudict.zip.\n",
142"[nltk_data] | Downloading package comparative_sentences to\n",
143"[nltk_data] | /root/nltk_data...\n",
144"[nltk_data] | Unzipping corpora/comparative_sentences.zip.\n",
145"[nltk_data] | Downloading package comtrans to /root/nltk_data...\n",
146"[nltk_data] | Downloading package conll2000 to /root/nltk_data...\n",
147"[nltk_data] | Unzipping corpora/conll2000.zip.\n",
148"[nltk_data] | Downloading package conll2002 to /root/nltk_data...\n",
149"[nltk_data] | Unzipping corpora/conll2002.zip.\n",
150"[nltk_data] | Downloading package conll2007 to /root/nltk_data...\n",
151"[nltk_data] | Downloading package crubadan to /root/nltk_data...\n",
152"[nltk_data] | Unzipping corpora/crubadan.zip.\n",
153"[nltk_data] | Downloading package dependency_treebank to\n",
154"[nltk_data] | /root/nltk_data...\n",
155"[nltk_data] | Unzipping corpora/dependency_treebank.zip.\n",
156"[nltk_data] | Downloading package dolch to /root/nltk_data...\n",
157"[nltk_data] | Unzipping corpora/dolch.zip.\n",
158"[nltk_data] | Downloading package europarl_raw to\n",
159"[nltk_data] | /root/nltk_data...\n",
160"[nltk_data] | Unzipping corpora/europarl_raw.zip.\n",
161"[nltk_data] | Downloading package floresta to /root/nltk_data...\n",
162"[nltk_data] | Unzipping corpora/floresta.zip.\n",
163"[nltk_data] | Downloading package framenet_v15 to\n",
164"[nltk_data] | /root/nltk_data...\n",
165"[nltk_data] | Unzipping corpora/framenet_v15.zip.\n",
166"[nltk_data] | Downloading package framenet_v17 to\n",
167"[nltk_data] | /root/nltk_data...\n",
168"[nltk_data] | Unzipping corpora/framenet_v17.zip.\n",
169"[nltk_data] | Downloading package gazetteers to /root/nltk_data...\n",
170"[nltk_data] | Unzipping corpora/gazetteers.zip.\n",
171"[nltk_data] | Downloading package genesis to /root/nltk_data...\n",
172"[nltk_data] | Unzipping corpora/genesis.zip.\n",
173"[nltk_data] | Downloading package gutenberg to /root/nltk_data...\n",
174"[nltk_data] | Unzipping corpora/gutenberg.zip.\n",
175"[nltk_data] | Downloading package ieer to /root/nltk_data...\n",
176"[nltk_data] | Unzipping corpora/ieer.zip.\n",
177"[nltk_data] | Downloading package inaugural to /root/nltk_data...\n",
178"[nltk_data] | Unzipping corpora/inaugural.zip.\n",
179"[nltk_data] | Downloading package indian to /root/nltk_data...\n",
180"[nltk_data] | Unzipping corpora/indian.zip.\n",
181"[nltk_data] | Downloading package jeita to /root/nltk_data...\n",
182"[nltk_data] | Downloading package kimmo to /root/nltk_data...\n",
183"[nltk_data] | Unzipping corpora/kimmo.zip.\n",
184"[nltk_data] | Downloading package knbc to /root/nltk_data...\n",
185"[nltk_data] | Downloading package lin_thesaurus to\n",
186"[nltk_data] | /root/nltk_data...\n",
187"[nltk_data] | Unzipping corpora/lin_thesaurus.zip.\n",
188"[nltk_data] | Downloading package mac_morpho to /root/nltk_data...\n",
189"[nltk_data] | Unzipping corpora/mac_morpho.zip.\n",
190"[nltk_data] | Downloading package machado to /root/nltk_data...\n",
191"[nltk_data] | Downloading package masc_tagged to /root/nltk_data...\n",
192"[nltk_data] | Downloading package moses_sample to\n",
193"[nltk_data] | /root/nltk_data...\n",
194"[nltk_data] | Unzipping models/moses_sample.zip.\n",
195"[nltk_data] | Downloading package movie_reviews to\n",
196"[nltk_data] | /root/nltk_data...\n",
197"[nltk_data] | Unzipping corpora/movie_reviews.zip.\n",
198"[nltk_data] | Downloading package names to /root/nltk_data...\n",
199"[nltk_data] | Unzipping corpora/names.zip.\n",
200"[nltk_data] | Downloading package nombank.1.0 to /root/nltk_data...\n",
201"[nltk_data] | Downloading package nps_chat to /root/nltk_data...\n",
202"[nltk_data] | Unzipping corpora/nps_chat.zip.\n",
203"[nltk_data] | Downloading package omw to /root/nltk_data...\n",
204"[nltk_data] | Unzipping corpora/omw.zip.\n",
205"[nltk_data] | Downloading package opinion_lexicon to\n",
206"[nltk_data] | /root/nltk_data...\n",
207"[nltk_data] | Unzipping corpora/opinion_lexicon.zip.\n",
208"[nltk_data] | Downloading package paradigms to /root/nltk_data...\n",
209"[nltk_data] | Unzipping corpora/paradigms.zip.\n",
210"[nltk_data] | Downloading package pil to /root/nltk_data...\n",
211"[nltk_data] | Unzipping corpora/pil.zip.\n",
212"[nltk_data] | Downloading package pl196x to /root/nltk_data...\n",
213"[nltk_data] | Unzipping corpora/pl196x.zip.\n",
214"[nltk_data] | Downloading package ppattach to /root/nltk_data...\n",
215"[nltk_data] | Unzipping corpora/ppattach.zip.\n",
216"[nltk_data] | Downloading package problem_reports to\n",
217"[nltk_data] | /root/nltk_data...\n",
218"[nltk_data] | Unzipping corpora/problem_reports.zip.\n",
219"[nltk_data] | Downloading package propbank to /root/nltk_data...\n",
220"[nltk_data] | Downloading package ptb to /root/nltk_data...\n",
221"[nltk_data] | Unzipping corpora/ptb.zip.\n",
222"[nltk_data] | Downloading package product_reviews_1 to\n",
223"[nltk_data] | /root/nltk_data...\n",
224"[nltk_data] | Unzipping corpora/product_reviews_1.zip.\n",
225"[nltk_data] | Downloading package product_reviews_2 to\n",
226"[nltk_data] | /root/nltk_data...\n",
227"[nltk_data] | Unzipping corpora/product_reviews_2.zip.\n",
228"[nltk_data] | Downloading package pros_cons to /root/nltk_data...\n",
229"[nltk_data] | Unzipping corpora/pros_cons.zip.\n",
230"[nltk_data] | Downloading package qc to /root/nltk_data...\n",
231"[nltk_data] | Unzipping corpora/qc.zip.\n",
232"[nltk_data] | Downloading package reuters to /root/nltk_data...\n",
233"[nltk_data] | Downloading package rte to /root/nltk_data...\n",
234"[nltk_data] | Unzipping corpora/rte.zip.\n",
235"[nltk_data] | Downloading package semcor to /root/nltk_data...\n",
236"[nltk_data] | Downloading package senseval to /root/nltk_data...\n",
237"[nltk_data] | Unzipping corpora/senseval.zip.\n",
238"[nltk_data] | Downloading package sentiwordnet to\n",
239"[nltk_data] | /root/nltk_data...\n",
240"[nltk_data] | Unzipping corpora/sentiwordnet.zip.\n",
241"[nltk_data] | Downloading package sentence_polarity to\n",
242"[nltk_data] | /root/nltk_data...\n",
243"[nltk_data] | Unzipping corpora/sentence_polarity.zip.\n",
244"[nltk_data] | Downloading package shakespeare to /root/nltk_data...\n",
245"[nltk_data] | Unzipping corpora/shakespeare.zip.\n",
246"[nltk_data] | Downloading package sinica_treebank to\n",
247"[nltk_data] | /root/nltk_data...\n",
248"[nltk_data] | Unzipping corpora/sinica_treebank.zip.\n",
249"[nltk_data] | Downloading package smultron to /root/nltk_data...\n",
250"[nltk_data] | Unzipping corpora/smultron.zip.\n",
251"[nltk_data] | Downloading package state_union to /root/nltk_data...\n",
252"[nltk_data] | Unzipping corpora/state_union.zip.\n",
253"[nltk_data] | Downloading package stopwords to /root/nltk_data...\n",
254"[nltk_data] | Unzipping corpora/stopwords.zip.\n",
255"[nltk_data] | Downloading package subjectivity to\n",
256"[nltk_data] | /root/nltk_data...\n",
257"[nltk_data] | Unzipping corpora/subjectivity.zip.\n",
258"[nltk_data] | Downloading package swadesh to /root/nltk_data...\n",
259"[nltk_data] | Unzipping corpora/swadesh.zip.\n",
260"[nltk_data] | Downloading package switchboard to /root/nltk_data...\n",
261"[nltk_data] | Unzipping corpora/switchboard.zip.\n",
262"[nltk_data] | Downloading package timit to /root/nltk_data...\n",
263"[nltk_data] | Unzipping corpora/timit.zip.\n",
264"[nltk_data] | Downloading package toolbox to /root/nltk_data...\n",
265"[nltk_data] | Unzipping corpora/toolbox.zip.\n",
266"[nltk_data] | Downloading package treebank to /root/nltk_data...\n",
267"[nltk_data] | Unzipping corpora/treebank.zip.\n",
268"[nltk_data] | Downloading package twitter_samples to\n",
269"[nltk_data] | /root/nltk_data...\n",
270"[nltk_data] | Unzipping corpora/twitter_samples.zip.\n",
271"[nltk_data] | Downloading package udhr to /root/nltk_data...\n",
272"[nltk_data] | Unzipping corpora/udhr.zip.\n",
273"[nltk_data] | Downloading package udhr2 to /root/nltk_data...\n",
274"[nltk_data] | Unzipping corpora/udhr2.zip.\n",
275"[nltk_data] | Downloading package unicode_samples to\n",
276"[nltk_data] | /root/nltk_data...\n",
277"[nltk_data] | Unzipping corpora/unicode_samples.zip.\n",
278"[nltk_data] | Downloading package universal_treebanks_v20 to\n",
279"[nltk_data] | /root/nltk_data...\n",
280"[nltk_data] | Downloading package verbnet to /root/nltk_data...\n",
281"[nltk_data] | Unzipping corpora/verbnet.zip.\n",
282"[nltk_data] | Downloading package verbnet3 to /root/nltk_data...\n",
283"[nltk_data] | Unzipping corpora/verbnet3.zip.\n",
284"[nltk_data] | Downloading package webtext to /root/nltk_data...\n",
285"[nltk_data] | Unzipping corpora/webtext.zip.\n",
286"[nltk_data] | Downloading package wordnet to /root/nltk_data...\n",
287"[nltk_data] | Unzipping corpora/wordnet.zip.\n",
288"[nltk_data] | Downloading package wordnet_ic to /root/nltk_data...\n",
289"[nltk_data] | Unzipping corpora/wordnet_ic.zip.\n",
290"[nltk_data] | Downloading package words to /root/nltk_data...\n",
291"[nltk_data] | Unzipping corpora/words.zip.\n",
292"[nltk_data] | Downloading package ycoe to /root/nltk_data...\n",
293"[nltk_data] | Unzipping corpora/ycoe.zip.\n",
294"[nltk_data] | Downloading package rslp to /root/nltk_data...\n",
295"[nltk_data] | Unzipping stemmers/rslp.zip.\n",
296"[nltk_data] | Downloading package maxent_treebank_pos_tagger to\n",
297"[nltk_data] | /root/nltk_data...\n",
298"[nltk_data] | Unzipping taggers/maxent_treebank_pos_tagger.zip.\n",
299"[nltk_data] | Downloading package universal_tagset to\n",
300"[nltk_data] | /root/nltk_data...\n",
301"[nltk_data] | Unzipping taggers/universal_tagset.zip.\n",
302"[nltk_data] | Downloading package maxent_ne_chunker to\n",
303"[nltk_data] | /root/nltk_data...\n",
304"[nltk_data] | Unzipping chunkers/maxent_ne_chunker.zip.\n",
305"[nltk_data] | Downloading package punkt to /root/nltk_data...\n",
306"[nltk_data] | Unzipping tokenizers/punkt.zip.\n",
307"[nltk_data] | Downloading package book_grammars to\n",
308"[nltk_data] | /root/nltk_data...\n",
309"[nltk_data] | Unzipping grammars/book_grammars.zip.\n",
310"[nltk_data] | Downloading package sample_grammars to\n",
311"[nltk_data] | /root/nltk_data...\n",
312"[nltk_data] | Unzipping grammars/sample_grammars.zip.\n",
313"[nltk_data] | Downloading package spanish_grammars to\n",
314"[nltk_data] | /root/nltk_data...\n",
315"[nltk_data] | Unzipping grammars/spanish_grammars.zip.\n",
316"[nltk_data] | Downloading package basque_grammars to\n",
317"[nltk_data] | /root/nltk_data...\n",
318"[nltk_data] | Unzipping grammars/basque_grammars.zip.\n",
319"[nltk_data] | Downloading package large_grammars to\n",
320"[nltk_data] | /root/nltk_data...\n",
321"[nltk_data] | Unzipping grammars/large_grammars.zip.\n",
322"[nltk_data] | Downloading package tagsets to /root/nltk_data...\n",
323"[nltk_data] | Unzipping help/tagsets.zip.\n",
324"[nltk_data] | Downloading package snowball_data to\n",
325"[nltk_data] | /root/nltk_data...\n",
326"[nltk_data] | Downloading package bllip_wsj_no_aux to\n",
327"[nltk_data] | /root/nltk_data...\n",
328"[nltk_data] | Unzipping models/bllip_wsj_no_aux.zip.\n",
329"[nltk_data] | Downloading package word2vec_sample to\n",
330"[nltk_data] | /root/nltk_data...\n",
331"[nltk_data] | Unzipping models/word2vec_sample.zip.\n",
332"[nltk_data] | Downloading package panlex_swadesh to\n",
333"[nltk_data] | /root/nltk_data...\n",
334"[nltk_data] | Downloading package mte_teip5 to /root/nltk_data...\n",
335"[nltk_data] | Unzipping corpora/mte_teip5.zip.\n",
336"[nltk_data] | Downloading package averaged_perceptron_tagger to\n",
337"[nltk_data] | /root/nltk_data...\n",
338"[nltk_data] | Unzipping taggers/averaged_perceptron_tagger.zip.\n",
339"[nltk_data] | Downloading package averaged_perceptron_tagger_ru to\n",
340"[nltk_data] | /root/nltk_data...\n",
341"[nltk_data] | Unzipping\n",
342"[nltk_data] | taggers/averaged_perceptron_tagger_ru.zip.\n",
343"[nltk_data] | Downloading package perluniprops to\n",
344"[nltk_data] | /root/nltk_data...\n",
345"[nltk_data] | Unzipping misc/perluniprops.zip.\n",
346"[nltk_data] | Downloading package nonbreaking_prefixes to\n",
347"[nltk_data] | /root/nltk_data...\n",
348"[nltk_data] | Unzipping corpora/nonbreaking_prefixes.zip.\n",
349"[nltk_data] | Downloading package vader_lexicon to\n",
350"[nltk_data] | /root/nltk_data...\n",
351"[nltk_data] | Downloading package porter_test to /root/nltk_data...\n",
352"[nltk_data] | Unzipping stemmers/porter_test.zip.\n",
353"[nltk_data] | Downloading package wmt15_eval to /root/nltk_data...\n",
354"[nltk_data] | Unzipping models/wmt15_eval.zip.\n",
355"[nltk_data] | Downloading package mwa_ppdb to /root/nltk_data...\n",
356"[nltk_data] | Unzipping misc/mwa_ppdb.zip.\n",
357"[nltk_data] | \n",
358"[nltk_data] Done downloading collection all\n"
359],
360"name": "stdout"
361}
362]
363},
364{
365"cell_type": "code",
366"metadata": {
367"id": "jX1CxFXWQELt"
368},
369"source": [
370"import os\n",
371"\n",
372"# SECURITY: API credentials were previously hardcoded here (and are therefore\n",
373"# leaked in this notebook's history — rotate them in the Reddit app settings).\n",
374"# Read them from environment variables instead of embedding them in source.\n",
375"reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],\n",
376"                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],\n",
377"                     user_agent='Flair-Detector',\n",
378"                     username=os.environ['REDDIT_USERNAME'],\n",
379"                     password=os.environ['REDDIT_PASSWORD'])"
372],
373"execution_count": 3,
374"outputs": []
375},
376{
377"cell_type": "code",
378"metadata": {
379"id": "AntEsya7QcRv"
380},
381"source": [
382"subreddit = reddit.subreddit('india')"
383],
384"execution_count": 4,
385"outputs": []
386},
387{
388"cell_type": "code",
389"metadata": {
390"id": "cba--ygKQlF9"
391},
392"source": [
393"labels = {\"title\":[], \"id\":[], \"score\":[], \"url\":[], \"created\": [], \"num_of_comments\": [], \"body\":[], \"author\":[], \"comments\":[], \"flair\":[]}\n",
394"flairs = [\"AskIndia\", \"Non-Political\", \"[R]eddiquette\", \"Scheduled\", \"Photography\", \"Science/Technology\", \"Politics\", \n",
395" \"Business/Finance\", \"Policy/Economy\", \"Sports\", \"Food\", \"AMA\",\"Coronavirus\"]"
396],
397"execution_count": 5,
398"outputs": []
399},
400{
401"cell_type": "code",
402"metadata": {
403"id": "3FWTHEwwRNbj"
404},
405"source": [
406"for flair in flairs:\n",
407" \n",
408" get_subreddits = subreddit.search(flair, limit=100)\n",
409" \n",
410" for each_post in get_subreddits:\n",
411" \n",
412" labels[\"flair\"].append(flair)\n",
413" labels[\"title\"].append(each_post.title)\n",
414" labels[\"score\"].append(each_post.score)\n",
415" labels[\"id\"].append(each_post.id)\n",
416" labels[\"url\"].append(each_post.url)\n",
417" labels[\"num_of_comments\"].append(each_post.num_comments)\n",
418" labels[\"created\"].append(each_post.created)\n",
419" labels[\"body\"].append(each_post.selftext)\n",
420" labels[\"author\"].append(each_post.author)\n",
421" \n",
422" each_post.comments.replace_more(limit=None)\n",
423" comment = ''\n",
424" for top_level_comment in each_post.comments:\n",
425" comment = comment + ' ' + top_level_comment.body\n",
426" labels[\"comments\"].append(comment)"
427],
428"execution_count": 6,
429"outputs": []
430},
431{
432"cell_type": "code",
433"metadata": {
434"id": "m46lo67MR1QQ"
435},
436"source": [
437"def get_date(created):\n",
438" return dt.datetime.fromtimestamp(created)"
439],
440"execution_count": 7,
441"outputs": []
442},
443{
444"cell_type": "code",
445"metadata": {
446"id": "miLXoiW3TnoJ"
447},
448"source": [
449"data = pd.DataFrame(labels)\n",
450"time =data[\"created\"].apply(get_date)\n",
451"data =data.assign(timestamp = time)\n",
452"del data['created']\n",
453"data.to_csv('reddit-india-data.csv', index=False)"
454],
455"execution_count": 9,
456"outputs": []
457},
458{
459"cell_type": "code",
460"metadata": {
461"colab": {
462"base_uri": "https://localhost:8080/",
463"height": 700
464},
465"id": "JIDYNrXqTrwP",
466"outputId": "a93a1db9-0bbd-42b7-e495-2001e15a722a"
467},
468"source": [
469"data=pd.read_csv('reddit-india-data.csv')\n",
470"data.head()"
471],
472"execution_count": 10,
473"outputs": [
474{
475"output_type": "execute_result",
476"data": {
477"text/html": [
478"<div>\n",
479"<style scoped>\n",
480" .dataframe tbody tr th:only-of-type {\n",
481" vertical-align: middle;\n",
482" }\n",
483"\n",
484" .dataframe tbody tr th {\n",
485" vertical-align: top;\n",
486" }\n",
487"\n",
488" .dataframe thead th {\n",
489" text-align: right;\n",
490" }\n",
491"</style>\n",
492"<table border=\"1\" class=\"dataframe\">\n",
493" <thead>\n",
494" <tr style=\"text-align: right;\">\n",
495" <th></th>\n",
496" <th>title</th>\n",
497" <th>id</th>\n",
498" <th>score</th>\n",
499" <th>url</th>\n",
500" <th>num_of_comments</th>\n",
501" <th>body</th>\n",
502" <th>author</th>\n",
503" <th>comments</th>\n",
504" <th>flair</th>\n",
505" <th>timestamp</th>\n",
506" </tr>\n",
507" </thead>\n",
508" <tbody>\n",
509" <tr>\n",
510" <th>0</th>\n",
511" <td>How to approach a girl?</td>\n",
512" <td>k0qt2r</td>\n",
513" <td>3</td>\n",
514" <td>https://www.reddit.com/r/india/comments/k0qt2r...</td>\n",
515" <td>15</td>\n",
516" <td>2 years back I was working in a startup compan...</td>\n",
517" <td>covidmanbun</td>\n",
518" <td>Stop watching Indian movies. \\nStop stalking...</td>\n",
519" <td>AskIndia</td>\n",
520" <td>2020-11-25 19:50:12</td>\n",
521" </tr>\n",
522" <tr>\n",
523" <th>1</th>\n",
524" <td>Where is gelatine available ?</td>\n",
525" <td>jk9zlt</td>\n",
526" <td>0</td>\n",
527" <td>https://www.reddit.com/r/india/comments/jk9zlt...</td>\n",
528" <td>4</td>\n",
529" <td>I wish to buy gelatine and am looking for the ...</td>\n",
530" <td>csstudentG</td>\n",
531" <td>It's available in your regular kirana stores....</td>\n",
532" <td>AskIndia</td>\n",
533" <td>2020-10-29 21:26:38</td>\n",
534" </tr>\n",
535" <tr>\n",
536" <th>2</th>\n",
537" <td>Trevor Noah's jokes during Indo-Pak tensions a...</td>\n",
538" <td>gv9lmh</td>\n",
539" <td>38</td>\n",
540" <td>https://www.reddit.com/r/india/comments/gv9lmh...</td>\n",
541" <td>29</td>\n",
542" <td>I don't really watch much of Trevor Noah's lat...</td>\n",
543" <td>CommYouNitty</td>\n",
544" <td>It's mostly because we as a country don't rea...</td>\n",
545" <td>AskIndia</td>\n",
546" <td>2020-06-02 23:42:24</td>\n",
547" </tr>\n",
548" <tr>\n",
549" <th>3</th>\n",
550" <td>Need feedback for Insurance Policy that I took...</td>\n",
551" <td>1s57oi</td>\n",
552" <td>1</td>\n",
553" <td>https://www.reddit.com/r/india/comments/1s57oi...</td>\n",
554" <td>1</td>\n",
555" <td>**Re-posting here because of lack of activity ...</td>\n",
556" <td>dhavalcoholic</td>\n",
557" <td>Dear Policy Holder(Dhavalcoholic),\\n \\nWe req...</td>\n",
558" <td>AskIndia</td>\n",
559" <td>2013-12-05 14:30:23</td>\n",
560" </tr>\n",
561" <tr>\n",
562" <th>4</th>\n",
563" <td>Buying used BS4 Scooty after April 1, Is there...</td>\n",
564" <td>hpqq5o</td>\n",
565" <td>12</td>\n",
566" <td>https://www.reddit.com/r/india/comments/hpqq5o...</td>\n",
567" <td>7</td>\n",
568" <td>I don't know if it's right place to ask, but \"...</td>\n",
569" <td>akza07</td>\n",
570" <td>Scammers will keep scamming. \\n\\nIt applies t...</td>\n",
571" <td>AskIndia</td>\n",
572" <td>2020-07-12 15:58:15</td>\n",
573" </tr>\n",
574" </tbody>\n",
575"</table>\n",
576"</div>"
577],
578"text/plain": [
579" title ... timestamp\n",
580"0 How to approach a girl? ... 2020-11-25 19:50:12\n",
581"1 Where is gelatine available ? ... 2020-10-29 21:26:38\n",
582"2 Trevor Noah's jokes during Indo-Pak tensions a... ... 2020-06-02 23:42:24\n",
583"3 Need feedback for Insurance Policy that I took... ... 2013-12-05 14:30:23\n",
584"4 Buying used BS4 Scooty after April 1, Is there... ... 2020-07-12 15:58:15\n",
585"\n",
586"[5 rows x 10 columns]"
587]
588},
589"metadata": {
590"tags": []
591},
592"execution_count": 10
593}
594]
595},
596{
597"cell_type": "code",
598"metadata": {
599"id": "POhV5u6UTvAg"
600},
601"source": [
602"def string(value):\n",
603" return str(value)"
604],
605"execution_count": 11,
606"outputs": []
607},
608{
609"cell_type": "code",
610"metadata": {
611"id": "vn4gCMiCT0j_"
612},
613"source": [
614"# BUG FIX: the previous code called string(data['title']) — i.e. str() on the\n",
615"# whole Series — which yields a single repr string of the entire column that\n",
616"# pandas then broadcasts into EVERY row (see the corrupted, identical\n",
617"# '0 approach girl1 gelatine available...' values in the later cell outputs).\n",
618"# Convert element-wise instead so each row keeps its own text.\n",
619"data['title'] = data['title'].astype(str)\n",
620"data['body'] = data['body'].astype(str)\n",
621"data['comments'] = data['comments'].astype(str)"
617],
618"execution_count": 12,
619"outputs": []
620},
621{
622"cell_type": "code",
623"metadata": {
624"id": "Tw1hectOT1sR"
625},
626"source": [
627"replace_by_space = re.compile('[/(){}\\[\\]\\|@,;]')\n",
628"bad_symbols = re.compile('[^0-9a-z #+_]')\n",
629"stopWords = set(stopwords.words('english'))\n",
630"def text_cleaning(text):\n",
631" \n",
632" text = BeautifulSoup(text, \"lxml\").text\n",
633" text = text.lower()\n",
634" text = replace_by_space.sub(' ', text)\n",
635" text = bad_symbols.sub('', text)\n",
636" text = ' '.join(word for word in text.split() if word not in stopWords)\n",
637" return text"
638],
639"execution_count": 13,
640"outputs": []
641},
642{
643"cell_type": "code",
644"metadata": {
645"id": "dqCNZYjXT4PM"
646},
647"source": [
648"data['title'] = data['title'].apply(text_cleaning)\n",
649"data['body'] = data['body'].apply(text_cleaning)\n",
650"data['comments'] = data['comments'].apply(text_cleaning)"
651],
652"execution_count": 16,
653"outputs": []
654},
655{
656"cell_type": "code",
657"metadata": {
658"id": "p9avmBZKUBQG"
659},
660"source": [
661"combined_features = data[\"title\"] + data[\"comments\"] + data[\"url\"] + data[\"body\"]\n",
662"data = data.assign(combined_features = combined_features)"
663],
664"execution_count": 17,
665"outputs": []
666},
667{
668"cell_type": "code",
669"metadata": {
670"id": "tuKJ0XluUDVI"
671},
672"source": [
673"# index=False prevents the spurious 'Unnamed: 0' column that appears when the\n",
674"# CSV is re-read (visible in the next cell's output of the original run).\n",
675"data.to_csv('data.csv', index=False)"
674],
675"execution_count": 18,
676"outputs": []
677},
678{
679"cell_type": "code",
680"metadata": {
681"colab": {
682"base_uri": "https://localhost:8080/",
683"height": 1000
684},
685"id": "1GuyCluSUGKr",
686"outputId": "9f93bb24-19f5-4d10-8131-3c7594a694cc"
687},
688"source": [
689"pd.read_csv('data.csv')"
690],
691"execution_count": 19,
692"outputs": [
693{
694"output_type": "execute_result",
695"data": {
696"text/html": [
697"<div>\n",
698"<style scoped>\n",
699" .dataframe tbody tr th:only-of-type {\n",
700" vertical-align: middle;\n",
701" }\n",
702"\n",
703" .dataframe tbody tr th {\n",
704" vertical-align: top;\n",
705" }\n",
706"\n",
707" .dataframe thead th {\n",
708" text-align: right;\n",
709" }\n",
710"</style>\n",
711"<table border=\"1\" class=\"dataframe\">\n",
712" <thead>\n",
713" <tr style=\"text-align: right;\">\n",
714" <th></th>\n",
715" <th>Unnamed: 0</th>\n",
716" <th>title</th>\n",
717" <th>id</th>\n",
718" <th>score</th>\n",
719" <th>url</th>\n",
720" <th>num_of_comments</th>\n",
721" <th>body</th>\n",
722" <th>author</th>\n",
723" <th>comments</th>\n",
724" <th>flair</th>\n",
725" <th>timestamp</th>\n",
726" <th>combined_features</th>\n",
727" </tr>\n",
728" </thead>\n",
729" <tbody>\n",
730" <tr>\n",
731" <th>0</th>\n",
732" <td>0</td>\n",
733" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
734" <td>k0qt2r</td>\n",
735" <td>3</td>\n",
736" <td>https://www.reddit.com/r/india/comments/k0qt2r...</td>\n",
737" <td>15</td>\n",
738" <td>0 2 years back working startup compan1 wish bu...</td>\n",
739" <td>covidmanbun</td>\n",
740" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
741" <td>AskIndia</td>\n",
742" <td>2020-11-25 19:50:12</td>\n",
743" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
744" </tr>\n",
745" <tr>\n",
746" <th>1</th>\n",
747" <td>1</td>\n",
748" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
749" <td>jk9zlt</td>\n",
750" <td>0</td>\n",
751" <td>https://www.reddit.com/r/india/comments/jk9zlt...</td>\n",
752" <td>4</td>\n",
753" <td>0 2 years back working startup compan1 wish bu...</td>\n",
754" <td>csstudentG</td>\n",
755" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
756" <td>AskIndia</td>\n",
757" <td>2020-10-29 21:26:38</td>\n",
758" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
759" </tr>\n",
760" <tr>\n",
761" <th>2</th>\n",
762" <td>2</td>\n",
763" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
764" <td>gv9lmh</td>\n",
765" <td>38</td>\n",
766" <td>https://www.reddit.com/r/india/comments/gv9lmh...</td>\n",
767" <td>29</td>\n",
768" <td>0 2 years back working startup compan1 wish bu...</td>\n",
769" <td>CommYouNitty</td>\n",
770" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
771" <td>AskIndia</td>\n",
772" <td>2020-06-02 23:42:24</td>\n",
773" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
774" </tr>\n",
775" <tr>\n",
776" <th>3</th>\n",
777" <td>3</td>\n",
778" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
779" <td>1s57oi</td>\n",
780" <td>1</td>\n",
781" <td>https://www.reddit.com/r/india/comments/1s57oi...</td>\n",
782" <td>1</td>\n",
783" <td>0 2 years back working startup compan1 wish bu...</td>\n",
784" <td>dhavalcoholic</td>\n",
785" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
786" <td>AskIndia</td>\n",
787" <td>2013-12-05 14:30:23</td>\n",
788" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
789" </tr>\n",
790" <tr>\n",
791" <th>4</th>\n",
792" <td>4</td>\n",
793" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
794" <td>hpqq5o</td>\n",
795" <td>12</td>\n",
796" <td>https://www.reddit.com/r/india/comments/hpqq5o...</td>\n",
797" <td>7</td>\n",
798" <td>0 2 years back working startup compan1 wish bu...</td>\n",
799" <td>akza07</td>\n",
800" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
801" <td>AskIndia</td>\n",
802" <td>2020-07-12 15:58:15</td>\n",
803" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
804" </tr>\n",
805" <tr>\n",
806" <th>...</th>\n",
807" <td>...</td>\n",
808" <td>...</td>\n",
809" <td>...</td>\n",
810" <td>...</td>\n",
811" <td>...</td>\n",
812" <td>...</td>\n",
813" <td>...</td>\n",
814" <td>...</td>\n",
815" <td>...</td>\n",
816" <td>...</td>\n",
817" <td>...</td>\n",
818" <td>...</td>\n",
819" </tr>\n",
820" <tr>\n",
821" <th>1211</th>\n",
822" <td>1211</td>\n",
823" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
824" <td>j002jl</td>\n",
825" <td>323</td>\n",
826" <td>https://thewire.in/politics/bjp-bihar-election...</td>\n",
827" <td>43</td>\n",
828" <td>0 2 years back working startup compan1 wish bu...</td>\n",
829" <td>mubukugrappa</td>\n",
830" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
831" <td>Coronavirus</td>\n",
832" <td>2020-09-26 13:23:26</td>\n",
833" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
834" </tr>\n",
835" <tr>\n",
836" <th>1212</th>\n",
837" <td>1212</td>\n",
838" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
839" <td>j0wucw</td>\n",
840" <td>0</td>\n",
841" <td>https://www.reddit.com/r/india/comments/j0wucw...</td>\n",
842" <td>0</td>\n",
843" <td>0 2 years back working startup compan1 wish bu...</td>\n",
844" <td>pizzapuff93</td>\n",
845" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
846" <td>Coronavirus</td>\n",
847" <td>2020-09-28 03:08:39</td>\n",
848" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
849" </tr>\n",
850" <tr>\n",
851" <th>1213</th>\n",
852" <td>1213</td>\n",
853" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
854" <td>izx7aw</td>\n",
855" <td>5</td>\n",
856" <td>https://www.reddit.com/r/india/comments/izx7aw...</td>\n",
857" <td>1</td>\n",
858" <td>0 2 years back working startup compan1 wish bu...</td>\n",
859" <td>spddgr8</td>\n",
860" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
861" <td>Coronavirus</td>\n",
862" <td>2020-09-26 10:12:50</td>\n",
863" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
864" </tr>\n",
865" <tr>\n",
866" <th>1214</th>\n",
867" <td>1214</td>\n",
868" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
869" <td>ix6pl0</td>\n",
870" <td>1020</td>\n",
871" <td>https://www.reddit.com/gallery/ix6pl0</td>\n",
872" <td>153</td>\n",
873" <td>0 2 years back working startup compan1 wish bu...</td>\n",
874" <td>IndianPuppy</td>\n",
875" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
876" <td>Coronavirus</td>\n",
877" <td>2020-09-22 02:55:01</td>\n",
878" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
879" </tr>\n",
880" <tr>\n",
881" <th>1215</th>\n",
882" <td>1215</td>\n",
883" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
884" <td>iwafpe</td>\n",
885" <td>289</td>\n",
886" <td>https://theconversation.com/india-why-secrecy-...</td>\n",
887" <td>9</td>\n",
888" <td>0 2 years back working startup compan1 wish bu...</td>\n",
889" <td>9kSs</td>\n",
890" <td>0 stop watching indian movies nstop stalking1 ...</td>\n",
891" <td>Coronavirus</td>\n",
892" <td>2020-09-20 16:56:43</td>\n",
893" <td>0 approach girl1 gelatine available 2 trevor n...</td>\n",
894" </tr>\n",
895" </tbody>\n",
896"</table>\n",
897"<p>1216 rows × 12 columns</p>\n",
898"</div>"
899],
900"text/plain": [
901" Unnamed: 0 ... combined_features\n",
902"0 0 ... 0 approach girl1 gelatine available 2 trevor n...\n",
903"1 1 ... 0 approach girl1 gelatine available 2 trevor n...\n",
904"2 2 ... 0 approach girl1 gelatine available 2 trevor n...\n",
905"3 3 ... 0 approach girl1 gelatine available 2 trevor n...\n",
906"4 4 ... 0 approach girl1 gelatine available 2 trevor n...\n",
907"... ... ... ...\n",
908"1211 1211 ... 0 approach girl1 gelatine available 2 trevor n...\n",
909"1212 1212 ... 0 approach girl1 gelatine available 2 trevor n...\n",
910"1213 1213 ... 0 approach girl1 gelatine available 2 trevor n...\n",
911"1214 1214 ... 0 approach girl1 gelatine available 2 trevor n...\n",
912"1215 1215 ... 0 approach girl1 gelatine available 2 trevor n...\n",
913"\n",
914"[1216 rows x 12 columns]"
915]
916},
917"metadata": {
918"tags": []
919},
920"execution_count": 19
921}
922]
923},
924{
925"cell_type": "code",
926"metadata": {
927"id": "9hUqQIICwtIt"
928},
929"source": [
930""
931],
932"execution_count": null,
933"outputs": []
934}
935]
936}