Buzzfeed-Clickbait/scrape.py

import asyncio
import calendar
import csv

import aiohttp
from lxml import html

# Candidate HTTP request headers for the scraper.
# NOTE(review): nothing in the visible code passes this dict to aiohttp, so
# these values are not sent with any request as written -- confirm whether it
# should be supplied to the session/request or removed.
# NOTE(review): 'name', 'location' and 'language' do not look like standard
# HTTP header names, and the standard header is spelled 'Referer' (single
# 'r'), not 'Referrer' -- verify intent before wiring this dict in.
header_values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'English',
    'User-Agent': 'Mozilla 4/0',
    'Accept-Encoding': 'gzip',
    'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
    'Upgrade-Insecure-Requests': '0',
    'Referrer': 'https://www.google.com/'
}

def get_links(start=2006, end=2023):
    """Build the BuzzFeed archive URL for every calendar day in a year range.

    Args:
        start: First year to include (inclusive). Defaults to 2006.
        end: Last year to include (inclusive). Defaults to 2023.

    Returns:
        A list of ``https://www.buzzfeed.com/archive/<year>/<month>/<day>``
        strings in chronological order. Month and day are not zero-padded.
        The list is empty when ``end`` is smaller than ``start``.
    """
    base = "https://www.buzzfeed.com/archive/"  # + y/m/d
    links = []
    for year in range(start, end + 1):
        for month in range(1, 13):
            # monthrange() returns (weekday of the 1st, number of days); using
            # the real month length avoids impossible dates such as 2/30 or
            # 4/31 and handles leap years.
            days_in_month = calendar.monthrange(year, month)[1]
            links.extend(
                f"{base}{year}/{month}/{day}"
                for day in range(1, days_in_month + 1)
            )
    return links

async def get_content_from_link(session, link):
    """Fetch one archive page and extract one row per listed article.

    Args:
        session: Object whose ``get(link)`` returns an async context manager
            yielding a response with an awaitable ``text()``; the caller in
            this file passes an ``aiohttp.ClientSession``.
        link: Archive page URL whose last three ``/``-separated components
            are ``<year>/<month>/<day>``.

    Returns:
        A list of ``(date, index, title, href, description, author)`` tuples,
        where ``date`` is ``"<year>/<month>/<day>"`` taken from ``link`` and
        ``index`` is the row's position on the page. Returns an empty list
        when fetching or parsing raises an ``Exception``.
    """

    def texts_from_html_elements(elements):
        # XPath text() results may carry surrounding whitespace.
        return [x.strip() for x in elements]

    title_path = '//div[2]/div/h2/a/text()[normalize-space()]'
    link_path = '//div[2]/div/h2/a/@href'
    desc_path = '//div[2]/div/p/text()[normalize-space()]'
    author_path = '//div[3]/div/div/a/span/text()[normalize-space()]'

    try:
        async with session.get(link) as response:
            tree = html.fromstring(await response.text())

            # Titles are stripped like descriptions and authors so all text
            # columns are normalised the same way.
            titles = texts_from_html_elements(tree.xpath(title_path))
            hrefs = tree.xpath(link_path)
            descs = texts_from_html_elements(tree.xpath(desc_path))
            authors = texts_from_html_elements(tree.xpath(author_path))

            date = "/".join(link.split("/")[-3:])
            print(date)
            # NOTE(review): zip() stops at the shortest input, so if a page
            # yields fewer descriptions/authors than titles the rows are
            # truncated and may be misaligned -- confirm against real pages.
            return list(
                zip([date] * len(titles), range(len(titles)),
                    titles, hrefs, descs, authors)
            )
    except Exception as exc:
        # Best-effort scraping: one failing page must not abort the whole
        # run. ``Exception`` (not a bare except) lets task cancellation and
        # KeyboardInterrupt propagate. The path after ".com" identifies the
        # page; the original indexed the literal (".com"[-1] == "m") instead
        # of the split result.
        print("unable to get ", link.split(".com")[-1], repr(exc))
        return []

async def get_content_from_links(links):
    """Fetch all archive pages concurrently and flatten the extracted rows.

    Args:
        links: Archive page URLs to fetch.

    Returns:
        One flat list containing the rows returned by
        ``get_content_from_link`` for every URL, grouped in the order of
        ``links``. Returns an empty list when ``links`` is empty.
    """
    if not links:
        # Nothing to fetch: avoid opening an HTTP session at all.
        return []
    async with aiohttp.ClientSession() as session:
        # gather() always returns a list with one result per awaitable, in
        # input order, so no None check is needed afterwards.
        pages = await asyncio.gather(
            *(get_content_from_link(session, link) for link in links)
        )
    return [row for page in pages for row in page]
def main():
    """Scrape every archive page and write the rows to ``./csv_file.csv``.

    The CSV has the header ``date, index, titles, links, descs, authors``
    followed by one row per scraped article. The collected rows are also
    printed to stdout.
    """
    links = get_links()
    # asyncio.run() creates a fresh event loop and closes it afterwards;
    # calling get_event_loop() without a running loop is deprecated.
    rows = asyncio.run(get_content_from_links(links))
    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open('./csv_file.csv', 'w', encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["date", "index", "titles", "links", "descs", "authors"])
        writer.writerows(rows)
    print(rows)


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
init 2023-07-24 21:12:15 +00:00			`import asyncio`
			`import csv`
			`import aiohttp`


			`from lxml import html`

			`header_values = {`
			`'name': 'Michael Foord',`
			`'location': 'Northampton',`
			`'language': 'English',`
			`'User-Agent': 'Mozilla 4/0',`
			`'Accept-Encoding': 'gzip',`
			`'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',`
			`'Upgrade-Insecure-Requests': '0',`
			`'Referrer': 'https://www.google.com/'`
			`}`

			`def get_links():`
			`links = []`
			`base = "https://www.buzzfeed.com/archive/" # + y/m/d`
			`start = 2006`
			`end = 2023`
			`for year in range(start, end+1):`
			`for month in range(1, 13):`
			`for day in range(1, 32):`
			`links.append(base + f"{year}/{month}/{day}")`
			`return links`

			`async def get_content_from_link(session, link):`
			`def texts_from_html_elements(elements):`
			`return [x.strip() for x in elements]`

			`try:`

			`async with session.get(link) as response:`

			`tree = html.fromstring(await response.text())`

			`title_path = '//div[2]/div/h2/a/text()[normalize-space()]'`
			`link_path = '//div[2]/div/h2/a/@href'`
			`desc_path = '//div[2]/div/p/text()[normalize-space()]'`
			`author_path = '//div[3]/div/div/a/span/text()[normalize-space()]'`

			`titles = tree.xpath(title_path)`
			`links = tree.xpath(link_path)`
			`descs = texts_from_html_elements(tree.xpath(desc_path))`
			`authors = texts_from_html_elements(tree.xpath(author_path))`
			`link_comp = link.split("/")`
			`date = link_comp[-3] + "/" + link_comp[-2] + "/" + link_comp[-1]`
			`print(date)`
			`return list(zip([date]*len(titles), range(0, len(titles)), titles, links, descs, authors))`
			`except:`
			`print("unable to get ", link.split(".com"[-1]))`
			`return []`

			`async def get_content_from_links(links):`
			`contents = []`
			`async with aiohttp.ClientSession() as session:`
			`contents = await asyncio.gather(*[get_content_from_link(session, link) for link in links])`
			`if contents is not None:`
			`return [item for row in contents for item in row]`
			`else:`
			`return []`
			`def main():`
			`links = get_links()`
			`x = asyncio.get_event_loop().run_until_complete(get_content_from_links(links))`
			`with open('./csv_file.csv', 'w', encoding="utf-8") as f:`
			`writer = csv.writer(f)`
			`writer.writerow(["date", "index", "titles", "links", "descs", "authors"])`
			`writer.writerows(x)`
			`print(x)`


			`if __name__ == "__main__":`
			`main()`