This repository has been archived on 2019-04-23. You can view files and clone it, but cannot push or open issues or pull requests.
Weebly-Scraper/weebly-scraper.py

84 lines
2.7 KiB
Python

#!/usr/bin/env python3
import sys
import datetime
from bs4 import BeautifulSoup
from urllib.request import urlopen
MAX_FILE_NAME = 120 # Max length (in characters) for generated post file names; longer slugs are truncated
DATE_FORMAT = 'en' # 'en' parses blog dates as '%m/%d/%Y'; put 'fr' here if your dates fit '%d/%m/%Y' format
class WeeblyScraper:
    """Scrape blog posts from a Weebly site and save each as a Markdown file.

    Posts are discovered by walking the paginated index at
    ``<base>/previous/<n>`` and written to ``content/<date>-<slug>.md``
    with a YAML front-matter header (title, date, url).
    """

    def scrape(self, url, nbPages):
        """Scrape up to ``nbPages`` index pages of the blog at ``url``.

        :param url: base URL of the Weebly blog (scheme and trailing
                    slash are added if missing).
        :param nbPages: number of index pages to walk; converted with
                        ``int()`` and made positive with ``abs()``.
        """
        # Normalize the base URL: ensure a scheme and a trailing slash so
        # path concatenation below is well-formed.
        if not url.startswith('http://') and \
                not url.startswith('https://'):
            url = 'http://' + url
        if not url.endswith('/'):
            url += '/'

        # Phase 1: collect every post URL from the paginated index.
        post_urls = []
        for pageNumber in range(1, abs(int(nbPages)) + 1):
            address = url + 'previous/' + str(pageNumber)
            print("Scraping page (" + address + ')')
            try:
                # BUG FIX: the original assigned urlopen() to `url`,
                # clobbering the base-URL string. Every page after the
                # first then failed (masked by a bare `except`), and the
                # later `post_url.replace(url, '')` would crash on an
                # HTTPResponse object. Use a distinct name instead.
                response = urlopen(address)
                soup = BeautifulSoup(response.read(), 'html.parser')
                if soup.find(id='blogTable'):
                    for post in soup.findAll('a', {
                        'class': 'blog-title-link'
                    }):
                        post_urls.append(post.get('href'))
            # OSError covers urllib.error.URLError/HTTPError (its
            # subclasses) plus low-level socket failures — narrower than
            # the original bare `except`, which hid real bugs.
            except OSError:
                print("Page not found, you don\'t have that many posts"
                      " (don\'t forget to check your Internet connection ?).")

        # Phase 2: fetch each post and write it out as Markdown.
        for post_url in post_urls:
            soup = BeautifulSoup(urlopen(post_url).read(), 'html.parser')
            title = soup.findAll('a', {
                'class': 'blog-title-link'
            })[0].get_text()
            # Escape double quotes so the YAML front-matter stays valid.
            title = title.replace('\"', '\\\"')
            date = soup.findAll('p', {
                'class': 'blog-date'
            })[0].get_text().strip()
            # Reformat the site's locale-dependent date as ISO (YYYY-MM-DD).
            date = datetime.datetime.strptime(date,
                                              '%d/%m/%Y' if DATE_FORMAT == 'fr'
                                              else '%m/%d/%Y'
                                              ).strftime('%Y-%m-%d')
            content = soup.findAll('div', {
                'class': 'blog-content'
            })[0].prettify()
            # Derive a file slug from the post URL, capped at MAX_FILE_NAME.
            filename = post_url.replace(url, '') \
                .replace('articles/', '')[:MAX_FILE_NAME]
            print("... writing " + date + '-' + filename + '.md')
            # 'w' (not the original 'w+') — we only write; the redundant
            # explicit close() is dropped since `with` already closes.
            with open('content/' + date + '-' + filename + '.md', 'w') as f:
                f.write('---\ntitle: ' + '\"' + title + '\"'
                        + '\ndate: ' + date
                        + "\nurl: " + post_url.replace(url, '')
                        + '\n---\n\n' + content)
if __name__ == '__main__':
    # Expect exactly two arguments: the blog's base URL and the number of
    # index pages to scrape.
    if len(sys.argv) != 3:
        # BUG FIX: the original adjacent string literals joined with no
        # separating space, printing "...posts' page><Number of pages...".
        print("\nUsage: `python3 " + sys.argv[0] + " <Weebly URL posts\' page>"
              " <Number of pages containing posts>`\n")
    else:
        WeeblyScraper().scrape(sys.argv[1], sys.argv[2])