python
keywords
web scraping
nltk
5 years, 4 months ago
from bs4 import BeautifulSoup
from bs4.element import Comment
from urllib.request import Request, urlopen
import csv
from nltk.corpus import stopwords
from nltk import word_tokenize
# English stop words, materialised as a set once so the token filter further
# down can use fast membership tests.
stop_words = set(stopwords.words('english'))
def tag_visible(element) -> bool:
    """Return True if *element* is text that counts as visible page content.

    A text node is treated as hidden when its parent tag is one of
    ``style``, ``script``, ``head``, ``title``, ``meta`` or ``[document]``,
    or when the node itself is an HTML comment.

    Args:
        element: A text node exposing ``.parent.name`` (as produced by
            ``soup.find_all(string=True)``).

    Returns:
        False for text under a non-rendered parent or for comments,
        True otherwise.
    """
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    # Comments are returned alongside ordinary text nodes, so they have to be
    # excluded explicitly.
    return not isinstance(element, Comment)
# Pages whose visible text is scraped and filtered for keywords.
urls = [
    'https://en.wikipedia.org/wiki/Natural_Language_Toolkit',
    'https://medium.com/analytics-vidhya/web-scraping-wiki-tables-using-beautifulsoup-and-python-6b9ea26d8722',
]
def text_from_html(body):
    """Extract the visible text from an HTML document.

    *body* is parsed with BeautifulSoup's ``html.parser``. Every text node
    accepted by ``tag_visible`` is stripped of surrounding whitespace and the
    pieces are joined with single spaces.

    Args:
        body: The HTML markup to parse, as passed to ``BeautifulSoup``.

    Returns:
        One string containing the visible text fragments separated by spaces.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it yields the same text nodes.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
def _fetch_html(url):
    """Download *url* and return the raw response body.

    A plain request is tried first. If it fails with a network or HTTP error
    the request is retried once with a browser-like ``User-Agent`` header;
    an error raised by that retry propagates to the caller.
    """
    try:
        with urlopen(url) as response:
            return response.read()
    except OSError:
        # URLError, HTTPError and low-level socket errors all derive from
        # OSError, so this no longer swallows KeyboardInterrupt/SystemExit
        # the way a bare ``except:`` did.
        # NOTE(review): presumably the retry exists because some servers
        # reject urllib's default User-Agent - confirm.
        request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(request) as response:
            return response.read()


# For each page: download it, reduce it to visible text, tokenize, drop stop
# words and print the remaining tokens.
for url in urls:
    html = _fetch_html(url)
    text = text_from_html(html)
    tokens = word_tokenize(text)
    # NLTK's English stop word list is lowercase, so compare case-insensitively;
    # otherwise capitalised words such as "The" or "And" are never filtered.
    result = [token for token in tokens if token.lower() not in stop_words]
    print(result)
0 Comments
Please Login to Comment Here