from urllib.parse import urljoin
from lxml import html
import requests
import json
import os
from time import sleep
import random


def getLinks():
    # Browser-like HTTP headers. The original dict also carried non-header
    # entries ('name', 'location', 'language') left over from the urllib
    # docs example; servers ignore them, so they are dropped. Note the HTTP
    # header is spelled 'Referer', and 'Upgrade-Insecure-Requests' is
    # conventionally sent as '1'.
    header_values = {
        'User-Agent': 'Mozilla/4.0',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'https://www.google.com/'
    }
    links = []
    seen = set()  # fast membership checks; `links` keeps insertion order
    with requests.Session() as session:
        root = "https://www.chefkoch.de/rs/s0/Rezepte.html"
        site = session.get(root, headers=header_values)
        site.raise_for_status()
        tree = html.fromstring(site.content)
        # converts the result counter "344.621 Ergebnisse" to int 344621
        max_results = int(tree.xpath(
            '/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))
        # Each results page shows 30 recipes; `i` is the offset in the URL.
        # A while loop (instead of `for i in range(...)`) fixes two bugs in
        # the original: decrementing the loop variable of a `for` loop has
        # no effect in Python, so failed pages were never retried, and the
        # result count re-read on each page never changed the iteration.
        i = 0
        while i < max_results:
            try:
                root = "https://www.chefkoch.de/rs/s" + str(i) + "/Rezepte.html"
                site = session.get(root, headers=header_values)
                site.raise_for_status()  # let HTTP errors hit the retry path
                tree = html.fromstring(site.content)
                # re-read the counter in case the result set changed
                max_results = int(tree.xpath(
                    '/html/body/main/div[1]/h1/span/text()')[0].split(" ")[0].replace(".", ""))
                # only add links we have not seen yet
                for x in tree.xpath('/html/body/main/article/a/@href'):
                    link = urljoin(root, x)  # make relative hrefs absolute
                    if link not in seen:
                        seen.add(link)
                        links.append(link)
                print(i)
                i += 30  # advance only after a successful page
            except Exception as e:
                # wait 10 seconds, then retry the same offset
                print(e)
                sleep(10)
            sleep(random.randint(0, 5))  # polite random delay between pages
        print(f"collected {len(links)} links")
    return links


links = getLinks()
os.makedirs('./data', exist_ok=True)  # make sure the output directory exists
with open('./data/links.json', 'w') as file:
    json.dump(links, file)
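

# A minimal sketch of how the saved links.json could be consumed by a later
# stage, assuming each entry is an absolute recipe URL. The function name
# fetch_recipe_titles and the //h1 XPath are illustrative assumptions, not
# part of the crawler above; it reuses the imports already at the top of
# this file.
def fetch_recipe_titles(path='./data/links.json', limit=5):
    with open(path) as f:
        saved_links = json.load(f)
    titles = []
    with requests.Session() as session:
        for url in saved_links[:limit]:
            page = session.get(url)
            page.raise_for_status()
            tree = html.fromstring(page.content)
            # assumption: the recipe title sits in the page's first <h1>
            titles.append(tree.xpath('//h1/text()')[0].strip())
            sleep(random.randint(0, 5))  # same polite delay as the crawler
    return titles

# example usage: print(fetch_recipe_titles(limit=3))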