# helseveileder_webscraper/main.py

import httpx
from selectolax.parser import HTMLParser
import re
import csv

def fetch_question_url(url, category_id="8288", size="100"):
    """Return the question-page URLs listed by the FAQ endpoint at *url*.

    Args:
        url: Address of the endpoint that serves the FAQ listing as JSON.
        category_id: Value sent as the ``idCategory`` query parameter.
        size: Value sent as the ``size`` query parameter; raise it to get
            more results from a single request.

    Returns:
        A list with the ``url`` field of every entry under the ``items`` key
        of the JSON response. An empty list is returned (and a message is
        printed) when the server does not answer with status 200, so callers
        can always iterate over the result.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
    }

    # Query parameters for the listing request.
    params = {
        "action": "get-faqs",
        "idSource": "2",
        "list": "frequently_asked_questions",
        "idCategory": str(category_id),
        "from": "0",
        "size": str(size),
        "gender": "false",
        "age": "false",
        "zone": "default",
        "filter_query": "",
        "skip": "0",
        "load_categories": "false",
    }

    response = httpx.get(url, params=params, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        # Return an empty list rather than falling through to an implicit
        # None, which would blow up as soon as the caller iterates over it.
        return []

    # NOTE(review): assumes the payload is an object with an "items" list
    # whose entries each carry a "url" key; a different shape raises KeyError.
    data = response.json()
    return [item['url'] for item in data['items']]

def fetch_all_info(url):
    """Download one question page and split it into question, answer, signature.

    Args:
        url: Address of a single question page.

    Returns:
        A three-item list ``[question, answer, signature]`` of strings. Any
        part whose element is not found on the page is an empty string.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
    }

    response = httpx.get(url, headers=headers)
    html = HTMLParser(response.text)

    # css_first yields None when nothing matches; guard it so a page without
    # the question element produces an empty field instead of an
    # AttributeError that would abort the whole scrape.
    question_node = html.css_first('div.article-text')
    questions = question_node.text() if question_node is not None else ""

    # Closing phrases that mark the start of the signature. Defined once so
    # the split below and the clean-up further down cannot drift apart.
    greeting = r'(Vennlig hilsen|Med vennlig hilsen|Mvh|Lykke til!)\s*'

    answers = ""
    signature = ""
    specific_div = html.css_first('.article-text.font-serif.text-base.py-10')
    if specific_div is not None:
        full_text = ' '.join(p.text() for p in specific_div.css('p'))

        # Everything before the first closing phrase is the answer; the rest
        # of that line is the signature ('.' does not match newlines).
        signature_match = re.search(greeting + r'(.*)', full_text, re.IGNORECASE)
        if signature_match:
            signature = signature_match.group(2)
            answers = full_text[:signature_match.start()]
        else:
            # No closing phrase found: treat the whole text as the answer.
            answers = full_text

    # Drop any further closing phrases left inside the signature, then trim
    # leading commas/whitespace and trailing whitespace.
    cleaned_signature = re.sub(greeting, '', signature, flags=re.IGNORECASE)
    cleaned_signature = re.sub(r'^[\s,]*', '', cleaned_signature).strip()

    return [questions, answers, cleaned_signature]

studenterspor_url = "https://www.studenterspor.no/ajax_handler.php"
# The listing call can come back empty (or as None) when the request fails;
# fall back to an empty list so the loop below simply writes no rows instead
# of raising TypeError.
question_urls = fetch_question_url(studenterspor_url) or []

# Write one CSV row per question page, preceded by a header row.
with open('studenterspor.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Question', 'Answer', 'Signature'])

    for url in question_urls:
        try:
            info = fetch_all_info(url)
        except httpx.HTTPError as exc:
            # One unreachable page should not discard the rest of the run.
            print(f"Skipping {url}: {exc}")
            continue
        writer.writerow(info)  # question, answer, signature

print("Done! Your data is now in 'studenterspor.csv'")