From 0da7504104b6bc5e1ebb3dd0d6ebfc6f721b5e39 Mon Sep 17 00:00:00 2001 From: Sindre Kjelsrud Date: Mon, 8 Jan 2024 14:14:05 +0100 Subject: [PATCH] =?UTF-8?q?:sparkles:=20sp=C3=B8rsm=C3=A5l=20is=20scraped!?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: haraldnilsen Signed-off-by: Sindre Kjelsrud --- main.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index d282f1d..4af8063 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ import httpx -import json +from selectolax.parser import HTMLParser def fetch_question_url(url): # Define headers @@ -36,9 +36,25 @@ def fetch_question_url(url): else: print(f"Failed to fetch data: {response.status_code}") +def fetch_all_info(url): + # Define headers + headers = { + "User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0" + } + + response = httpx.get(url, headers=headers) + html = HTMLParser(response.text) + + # Extracting the question and answer + sporsmal = html.css_first('div.article-text').text() + + # Create array of results + #result = [sporsmal, svar, signatur] + + return sporsmal + studenterspor_url = "https://www.studenterspor.no/ajax_handler.php" urls = fetch_question_url(studenterspor_url) -if urls: - for url in urls: - print(url) \ No newline at end of file +for url in urls: + print(fetch_all_info(url)) \ No newline at end of file