commit aefcfa85fa83c52680f26e5e0d8d37c60e3ccbab
Author: Askill
Date:   Fri Oct 14 23:04:13 2022 +0200

    started optar

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e201d3a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+venv/**
+.idea
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..aa92679
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/../../../../:\projects\optar\.idea/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..03d9549
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,6 @@
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..6fc71a5
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..27c3adc
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
diff --git a/.idea/optar.iml b/.idea/optar.iml
new file mode 100644
index 0000000..85e816f
--- /dev/null
+++ b/.idea/optar.iml
@@ -0,0 +1,10 @@
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
diff --git a/src/Crawler.py b/src/Crawler.py
new file mode 100644
index 0000000..67001bf
--- /dev/null
+++ b/src/Crawler.py
@@ -0,0 +1,95 @@
+import json
+from time import sleep
+from urllib.parse import urljoin
+from lxml import html
+import requests
+import logging
+
+
+class Crawler:
+    url = ""  # the root url of the website being crawled
+    links = dict()  # dict mapping each crawled page to the urls found on it
+    header_values = {  # default request headers (currently unused)
+        'Connection': 'Keep-alive',
+        'name': 'Michael Foord',
+        'location': 'Northampton',
+        'language': 'English',
+        'User-Agent': 'Mozilla/4.0'}
+
+    exclude = [  # url substrings that should not be crawled
+    ]
+
+    def __init__(self, logger=None, exclude=None):
+        if exclude:
+            self.exclude += exclude
+        if logger:
+            self.logger = logger
+        else:
+            self.logger = logging.getLogger("star_crawler")
+            self.logger.setLevel(logging.INFO)
+
+    def persist(self, path):
+        with open(path, 'w') as fp:
+            json.dump(self.links, fp)
+
+    def load_site(self, path):
+        with open(path, 'r') as fp:
+            self.links = json.load(fp)
+
+    def run(self, root, limit, sleep_time=0):
+        self.url = root
+        unchecked = [root]
+
+        while unchecked and len(self.links) < limit:
+            root = unchecked.pop()
+            if root in self.links or self.url.rsplit('/')[2] not in root:  # already crawled or off-domain
+                continue
+            if not root.startswith("https"):
+                continue
+
+            clean = False
+            for element in self.exclude:
+                if element in root:
+                    clean = False
+                    break
+            else:
+                clean = True
+            if not clean:
+                continue
+
+            self.logger.info(f"{len(self.links)} {root}")
+            try:
+                site = requests.get(root)
+                tree = html.fromstring(site.content)
+                links = tree.xpath('//a/@href')
+            except Exception:
+                continue
+
+            nlinks = []
+            for link in links:
+                if link not in nlinks:
+                    if link.startswith("http"):
+                        nlinks.append(link)
+                    else:
+                        nlinks.append(urljoin(site.url, link))
+
+            unchecked += nlinks
+            self.links[root] = nlinks
+            sleep(sleep_time)
+
+    def getNodesEdges(self):
+        nodes = []
+        edges = []
+        for key, value in self.links.items():
+            nodes.append(key)
+            for edge in value:
+                edges.append([key, edge])
+
+        return nodes, edges
+
+    def makeGraph(self, g):
+        nodes, edges = self.getNodesEdges()
+        for node in nodes:
+            g.add_node(node)
+        for f, t in edges:
+            g.add_edge(f, t)
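A rough usage sketch for the Crawler above (not part of the commit; the start URL, the exclude pattern, and the networkx graph object are illustrative assumptions):

    import networkx as nx
    from src.Crawler import Crawler

    crawler = Crawler(exclude=["/login"])
    crawler.run("https://example.com/", limit=50, sleep_time=1)

    # build a directed graph of the crawled link structure via makeGraph()
    g = nx.DiGraph()
    crawler.makeGraph(g)
    print(g.number_of_nodes(), "pages,", g.number_of_edges(), "links")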
diff --git a/src/SiteReader.py b/src/SiteReader.py
new file mode 100644
index 0000000..6af81bb
--- /dev/null
+++ b/src/SiteReader.py
@@ -0,0 +1,79 @@
+import json
+from typing import List, Dict
+import requests
+import trafilatura
+from requests.exceptions import MissingSchema
+from bs4 import BeautifulSoup
+
+
+class SiteReader:
+    def __init__(self):
+        pass
+
+    def beautifulsoup_extract_text_fallback(self, response_content):
+
+        '''
+        Fallback extractor, so that we can always return a value for the text content,
+        even when Trafilatura is unable to extract the text from a
+        single URL.
+        '''
+
+        # Create the BeautifulSoup object:
+        soup = BeautifulSoup(response_content, 'html.parser')
+
+        # Find all text nodes:
+        text = soup.find_all(text=True)
+
+        # Remove unwanted tag elements:
+        cleaned_text = ''
+        blacklist = [
+            '[document]',
+            'noscript',
+            'header',
+            'html',
+            'meta',
+            'head',
+            'input',
+            'script',
+            'style', ]
+
+        # Loop over every extracted text node and keep it only if its parent tag
+        # is NOT in the blacklist
+        for item in text:
+            if item.parent.name not in blacklist:
+                cleaned_text += '{} '.format(item)
+
+        # Remove any tab separation and strip the text:
+        cleaned_text = cleaned_text.replace('\t', '')
+        return cleaned_text.strip()
+
+    def extract_text_from_single_web_page(self, url):
+
+        downloaded_url = trafilatura.fetch_url(url)
+        try:
+            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True, include_comments=False,
+                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+        except AttributeError:
+            a = trafilatura.extract(downloaded_url, json_output=True, with_metadata=True,
+                                    date_extraction_params={'extensive_search': True, 'original_date': True})
+        if a:
+            json_output = json.loads(a)
+            return json_output['text']
+        else:
+            try:
+                resp = requests.get(url)
+                # We only extract the text from successful requests:
+                if resp.status_code == 200:
+                    return self.beautifulsoup_extract_text_fallback(resp.content)
+                else:
+                    # Anything else is treated as a failure of both the Trafilatura and BeautifulSoup paths:
+                    return None
+            # Handle any URLs that don't have the correct protocol
+            except MissingSchema:
+                return None
+
+    def get_sites_content_dynamic(self, urls: List[str]):
+        pass
+
+    def get_sites_content_static(self, urls: List[str]) -> Dict[str, str]:
+        return {url: self.extract_text_from_single_web_page(url) for url in urls}
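A minimal sketch of how the SiteReader might be called (the URLs are placeholders; a value can be None when both Trafilatura and the BeautifulSoup fallback fail):

    from src.SiteReader import SiteReader

    reader = SiteReader()
    texts = reader.get_sites_content_static([
        "https://example.com/",
        "https://example.com/about",
    ])
    for url, text in texts.items():
        # text is the extracted plain text, or None on failure
        print(url, (text or "")[:80])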
diff --git a/src/SiteStore.py b/src/SiteStore.py
new file mode 100644
index 0000000..f94aa84
--- /dev/null
+++ b/src/SiteStore.py
@@ -0,0 +1,15 @@
+import os
+from typing import List
+
+
+class SiteStore:
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def get_site_history(fqdn) -> List[str]:
+        cache_path = f"./cached/{fqdn}"
+        if not os.path.isdir(cache_path):
+            return []
+        return sorted(os.path.join(cache_path, name) for name in os.listdir(cache_path))  # full paths, oldest first
+
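SiteStore only lists whatever is under ./cached/<fqdn>/; nothing in this commit writes that cache yet. One assumed way to populate it, so that get_site_history returns snapshots oldest first, is to persist a timestamped JSON dump of Crawler.links after each crawl (file layout and naming here are illustrative, not from the commit):

    import os
    from datetime import datetime, timezone
    from src.Crawler import Crawler
    from src.SiteStore import SiteStore

    fqdn = "example.com"
    cache_dir = f"./cached/{fqdn}"
    os.makedirs(cache_dir, exist_ok=True)

    crawler = Crawler()
    crawler.run(f"https://{fqdn}/", limit=50)
    # ISO-style timestamps sort lexicographically, so sorted() keeps them in crawl order
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
    crawler.persist(f"{cache_dir}/{stamp}.json")

    print(SiteStore.get_site_history(fqdn))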
diff --git a/src/Watcher.py b/src/Watcher.py
new file mode 100644
index 0000000..afe7356
--- /dev/null
+++ b/src/Watcher.py
@@ -0,0 +1,46 @@
+import json
+from typing import List, Dict
+
+from src.SiteReader import SiteReader
+from src.SiteStore import SiteStore
+
+
+class Watcher:
+    def __init__(self) -> None:
+        self.site_store = SiteStore()
+        self.site_reader = SiteReader()
+        self.keywords_source_path = ""
+        self.sites_source_path = ""
+
+    def read_txt_file(self, path):
+        with open(path) as f:
+            return f.read().splitlines()
+
+    def watch(self):
+        while True:
+            keywords = self.read_txt_file(self.keywords_source_path)
+            sites = self.read_txt_file(self.sites_source_path)
+
+            # newly changed page text per watched site
+            contents = {site: self.get_new_content(site) for site in sites}
+            matches = []
+            for site, content in contents.items():
+                matches.append(self.search_sites(site, content, keywords))
+            print(matches)
+
+    def get_new_content(self, fqdn) -> Dict[str, str]:
+        """ get the text of pages that changed between the two most recent cached
+        crawls of a site, identified by its fully qualified domain name;
+        assumes each cached snapshot is a JSON dump of the Crawler.links mapping """
+        list_of_files = self.site_store.get_site_history(fqdn)
+        if len(list_of_files) < 2:
+            return {}
+        with open(list_of_files[-2]) as fp:
+            prev_version = json.load(fp)
+        with open(list_of_files[-1]) as fp:
+            current_version = json.load(fp)
+        # pages that were added, removed, or whose outgoing links changed
+        news = [url for url in set(prev_version) | set(current_version)
+                if prev_version.get(url) != current_version.get(url)]
+        sites_contents = self.site_reader.get_sites_content_static(news)
+
+        return sites_contents
+
+    def search_sites(self, url, content: Dict[str, str], keywords: List[str]):
+        results = []
+        for keyword in keywords:
+            # report a match if the keyword appears in any of the newly extracted texts
+            if any(text and keyword in text for text in content.values()):
+                results.append((url, keyword))
+        return results
\ No newline at end of file
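And a sketch of wiring the Watcher up (the two text files are assumed inputs with one keyword or domain per line; watch() loops indefinitely and prints the matches it finds):

    from src.Watcher import Watcher

    watcher = Watcher()
    watcher.keywords_source_path = "./keywords.txt"   # e.g. "security advisory"
    watcher.sites_source_path = "./sites.txt"         # e.g. "example.com"
    watcher.watch()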