#!/usr/bin/env python3
|
|
|
|
|
|
import sys
|
|
import datetime
|
|
from bs4 import BeautifulSoup
|
|
from urllib.request import urlopen
|
|
|
|
|
|
MAX_FILE_NAME = 120 # Max length for post file names
|
|
DATE_FORMAT = 'en' # Put 'fr' here if your dates fit '%d/%m/%Y' format
|
|
|
|
|
|
class WeeblyScraper():
    """Scrape the posts of a Weebly blog and dump each one as a Markdown file.

    Each post is written to ``content/<date>-<slug>.md`` with a small YAML
    front-matter header (title, date, original relative URL) followed by the
    prettified HTML of the post body.
    """

    def scrape(self, url, nbPages):
        """Fetch every post listed on the blog's paginated index and save it.

        Args:
            url: Base URL of the Weebly blog (scheme and trailing slash are
                added if missing).
            nbPages: Number of index pages to walk (string or int; the
                absolute value is used).
        """
        # Normalize the base URL: ensure a scheme and a trailing slash so the
        # page addresses and the replace() calls below compose correctly.
        if not url.startswith('http://') and \
           not url.startswith('https://'):
            url = 'http://' + url
        if not url.endswith('/'):
            url += '/'

        post_urls = []

        # Pass 1: walk the paginated index ("/previous/<n>") and collect the
        # href of every post title link.
        for pageNumber in range(1, abs(int(nbPages)) + 1):
            address = url + 'previous/' + str(pageNumber)
            print("Scraping page (" + address + ')')

            try:
                # NOTE: the original code rebound `url` to this response
                # object, which broke every subsequent loop iteration and the
                # post_url.replace(url, '') calls below. Use a dedicated name.
                response = urlopen(address)
                soup = BeautifulSoup(response.read(), 'html.parser')

                if soup.find(id='blogTable'):
                    for post in soup.findAll('a', {
                        'class': 'blog-title-link'
                    }):
                        post_urls.append(post.get('href'))
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # still propagate; network and parse errors are reported.
            except Exception:
                print("Page not found, you don\'t have that many posts"
                      " (don\'t forget to check your Internet connection ?).")

        # Pass 2: fetch each post page and write it out as Markdown.
        for post_url in post_urls:
            soup = BeautifulSoup(urlopen(post_url).read(), 'html.parser')

            title = soup.findAll('a', {
                'class': 'blog-title-link'
            })[0].get_text()
            # Escape double quotes so the title stays a valid quoted YAML value.
            title = title.replace('\"', '\\\"')

            date = soup.findAll('p', {
                'class': 'blog-date'
            })[0].get_text().strip()
            # Normalize the displayed date to ISO (YYYY-MM-DD) for filenames.
            date = datetime.datetime.strptime(date,
                                              '%d/%m/%Y' if DATE_FORMAT == 'fr'
                                              else '%m/%d/%Y'
                                              ).strftime('%Y-%m-%d')

            content = soup.findAll('div', {
                'class': 'blog-content'
            })[0].prettify()

            # Derive a slug from the post's path, capped to a safe length.
            filename = post_url.replace(url, '') \
                .replace('articles/', '')[:MAX_FILE_NAME]

            print("... writing " + date + '-' + filename + '.md')

            with open('content/' + date + '-' + filename + '.md', 'w+') as f:
                f.write('---\ntitle: ' + '\"' + title + '\"'
                        + '\ndate: ' + date
                        + "\nurl: " + post_url.replace(url, '')
                        + '\n---\n\n' + content)
                # No explicit close(): the with-statement releases the file.
|
|
|
|
|
|
if __name__ == '__main__':
    # Expect exactly two CLI arguments: the blog URL and the page count.
    if len(sys.argv) != 3:
        # Fixed: the original message ran the two placeholders together
        # ("...page><Number..."); a separating space is now included.
        print("\nUsage: `python3 " + sys.argv[0] + " <Weebly URL posts\' page>"
              " <Number of pages containing posts>`\n")

    else:
        WeeblyScraper().scrape(sys.argv[1], sys.argv[2])
|