answers now scraped!

redid the signature-scraping as well

Co-authored-by: haraldnilsen <harald_998@hotmail.com>
Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com>
This commit is contained in:
Sindre Kjelsrud 2024-01-08 14:45:44 +01:00
parent 23520a5ac1
commit 93c4464080
Signed by untrusted user who does not match committer: sidski
GPG key ID: D2BBDF3EDE6BA9A6

38
main.py
View file

@ -1,5 +1,6 @@
import httpx import httpx
from selectolax.parser import HTMLParser from selectolax.parser import HTMLParser
import re
def fetch_question_url(url): def fetch_question_url(url):
# Define headers # Define headers
@ -45,28 +46,33 @@ def fetch_all_info(url):
response = httpx.get(url, headers=headers) response = httpx.get(url, headers=headers)
html = HTMLParser(response.text) html = HTMLParser(response.text)
# Extracting the question and answer # Extracting the question "sporsmal"
sporsmal = html.css_first('div.article-text').text() sporsmal = html.css_first('div.article-text').text()
# Check if the element exists before accessing its text # Extracting the answer "svar" and the signature "signature"
signatur_div = html.css_first('div.signatur') svar = ""
signatur = "" signature = ""
if signatur_div: specific_div = html.css_first('.article-text.font-serif.text-base.py-10')
signaturTxt = signatur_div.text() if specific_div:
# Remove "Med vennlig hilsen" or "Vennlig hilsen" p_elements = specific_div.css('p')
signaturTxt = signaturTxt.replace("Med vennlig hilsen", "").replace("Vennlig hilsen", "").strip() full_text = ' '.join([p.text() for p in p_elements])
# Check if the text is empty after removal
if signaturTxt: # Separate the signature using a regular expression
signatur = signaturTxt signature_match = re.search(r'(Vennlig hilsen|Med vennlig hilsen|Mvh|Lykke til!)\s*(.*)', full_text, re.IGNORECASE)
if signature_match:
signature = signature_match.group(2) # This is the signature text
svar = full_text[:signature_match.start()] # This is the text before the signature
else: else:
signatur = "Null signatur" svar = full_text # In case there is no signature
else:
signatur = "Null signatur" # Clean up the signature
cleaned_signature = re.sub(r'(Vennlig hilsen|Med vennlig hilsen|Mvh|Lykke til!)\s*', '', signature, flags=re.IGNORECASE)
cleaned_signature = re.sub(r'^[\s,]*', '', cleaned_signature).strip() # Remove leading commas and spaces
# Create array of results # Create array of results
#result = [sporsmal, svar, signatur] result = [sporsmal, svar, cleaned_signature]
return signatur return result
studenterspor_url = "https://www.studenterspor.no/ajax_handler.php" studenterspor_url = "https://www.studenterspor.no/ajax_handler.php"
question_urls = fetch_question_url(studenterspor_url) question_urls = fetch_question_url(studenterspor_url)