# Buzzfeed-Clickbait/scrape.py

import asyncio
import calendar
import csv

import aiohttp
from lxml import html

# Candidate HTTP request headers for the scraper.
# NOTE(review): no function visible in this file passes this dict to a
# request — confirm whether it is meant to be supplied to the aiohttp session.
# The 'name', 'location' and 'language' keys are not standard HTTP header
# names; they are kept unchanged in case other code reads them.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    # The HTTP header name is spelled "Referer" (single "r"); the former
    # 'Referrer' key would not be recognised as that header by a server.
    'Referer': 'https://www.google.com/',
}

def get_links(start=2006, end=2023):
    """Build the BuzzFeed archive URL for every calendar day in a year range.

    Only dates that exist in the calendar are generated, so no URL is
    produced for days such as February 30th or for February 29th outside
    leap years.

    Args:
        start: First year to include (inclusive).
        end: Last year to include (inclusive).

    Returns:
        A list of URLs of the form
        ``https://www.buzzfeed.com/archive/<year>/<month>/<day>`` in
        chronological order, with month and day not zero-padded.
    """
    base = "https://www.buzzfeed.com/archive/"  # + y/m/d
    links = []
    for year in range(start, end + 1):
        for month in range(1, 13):
            # monthrange returns (weekday of the 1st, number of days in month).
            days_in_month = calendar.monthrange(year, month)[1]
            links.extend(
                f"{base}{year}/{month}/{day}"
                for day in range(1, days_in_month + 1)
            )
    return links

async def get_content_from_link(session, link):
    """Fetch one archive page and extract one row per listed article.

    Args:
        session: Object whose ``get(link)`` returns an async context manager
            yielding a response with an awaitable ``text()``; the caller in
            this file passes an ``aiohttp.ClientSession``.
        link: Archive URL whose last three path components are
            ``year/month/day``.

    Returns:
        A list of ``(date, index, title, link, desc, author)`` tuples, where
        ``date`` is the trailing ``year/month/day`` of ``link`` and ``index``
        is the row's position within the page. Returns ``[]`` when fetching
        or parsing raises an exception.
    """
    def texts_from_html_elements(elements):
        # XPath text results may carry surrounding whitespace; trim it.
        return [x.strip() for x in elements]

    try:
        async with session.get(link) as response:
            page_source = await response.text()

        tree = html.fromstring(page_source)

        title_path = '//div[2]/div/h2/a/text()[normalize-space()]'
        link_path = '//div[2]/div/h2/a/@href'
        desc_path = '//div[2]/div/p/text()[normalize-space()]'
        author_path = '//div[3]/div/div/a/span/text()[normalize-space()]'

        titles = tree.xpath(title_path)
        links = tree.xpath(link_path)
        descs = texts_from_html_elements(tree.xpath(desc_path))
        authors = texts_from_html_elements(tree.xpath(author_path))

        # The date is the trailing "year/month/day" part of the URL.
        date = "/".join(link.split("/")[-3:])
        print(date)

        # NOTE: zip stops at the shortest sequence, so if the four XPath
        # queries return different numbers of matches the extra entries are
        # dropped and columns may not line up per article.
        return list(zip(
            [date] * len(titles),
            range(len(titles)),
            titles,
            links,
            descs,
            authors,
        ))
    except Exception as exc:
        # Best-effort scraping: report the failure and carry on with the
        # other pages. ``Exception`` (rather than a bare ``except``) lets
        # task cancellation, KeyboardInterrupt and SystemExit propagate.
        # Show only the path after ".com"; the index must be applied to the
        # split result, not to the ".com" literal.
        print("unable to get ", link.split(".com")[-1], f"({exc!r})")
        return []

async def get_content_from_links(links):
    """Fetch all archive pages concurrently and flatten their rows.

    Opens one ``aiohttp.ClientSession`` shared by every request and runs
    ``get_content_from_link`` for each URL via ``asyncio.gather``.

    Args:
        links: Iterable of archive URLs.

    Returns:
        A single flat list containing the rows of every page, grouped by
        page in the same order as ``links``.
    """
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(get_content_from_link(session, link) for link in links)
        )
    # gather always returns a list (one result per awaitable, in input
    # order), so no None check is needed before flattening.
    return [item for page in pages for item in page]
def main():
    """Scrape every archive page and write the rows to ``./csv_file.csv``.

    The CSV gets a header row followed by one row per scraped article; the
    collected rows are also printed to stdout.
    """
    links = get_links()
    # asyncio.run creates a fresh event loop, runs the coroutine and closes
    # the loop; asyncio.get_event_loop() is deprecated for this use when no
    # loop is running.
    x = asyncio.run(get_content_from_links(links))
    # newline="" lets the csv module control line endings itself; without it
    # extra blank lines appear between rows on platforms that write \r\n.
    with open('./csv_file.csv', 'w', encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["date", "index", "titles", "links", "descs", "authors"])
        writer.writerows(x)
    print(x)


# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()