# helseveileder_webscraper/main.py

import httpx
from selectolax.parser import HTMLParser
import re
import csv
import config

def fetch_question_url(url):
    """Fetch the URLs of the questions listed by the endpoint at *url*.

    Sends a GET request with the fixed ``get-faqs`` query parameters defined
    below and collects the ``url`` field of every entry in the ``items`` list
    of the JSON response.

    Args:
        url: Address of the endpoint that serves the question listing.

    Returns:
        A list with one URL per listed question. When the server answers
        with a status other than 200 a message is printed and an empty list
        is returned, so callers can always iterate over the result.

    Raises:
        KeyError: If the JSON payload has no ``items`` key or an item has no
            ``url`` key.
    """
    # Query parameters for the listing request
    params = {
        "action": "get-faqs",
        "idSource": "2",
        "list": "frequently_asked_questions",
        "idCategory": "8288",  # Id for category
        "from": "0",
        "size": "100",  # Adjust this to get more results
        "gender": "false",
        "age": "false",
        "zone": "default",
        "filter_query": "",
        "skip": "0",
        "load_categories": "false",
    }

    response = httpx.get(url, params=params, headers=config.headers)

    # Bail out early on failure. Returning an empty list (instead of falling
    # through and returning None) keeps `for url in fetch_question_url(...)`
    # from raising TypeError.
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    data = response.json()
    return [item['url'] for item in data['items']]

# Closing phrases that mark where an answer ends and its signature begins.
_SIGN_OFF = r'(Vennlig hilsen|Med vennlig hilsen|Mvh|Lykke til!)'


def _split_signature(full_text):
    """Split *full_text* into ``(answer, signature)`` at the first sign-off phrase."""
    match = re.search(_SIGN_OFF + r'\s*(.*)', full_text, re.IGNORECASE)
    if not match:
        # No sign-off phrase: everything is answer text, no signature
        return full_text, ""

    answer = full_text[:match.start()]  # Text before the sign-off phrase
    signature = match.group(2)  # Text after the sign-off phrase

    # Drop any further sign-off phrases that ended up inside the signature,
    # then remove leading commas and whitespace
    signature = re.sub(_SIGN_OFF + r'\s*', '', signature, flags=re.IGNORECASE)
    signature = re.sub(r'^[\s,]*', '', signature).strip()
    return answer, signature


def _parse_metadata(metadata_text):
    """Reduce the raw metadata text to a ``"<gender>, <date>"`` string."""
    # NOTE(review): a date written with dots (e.g. 08.01.2024) would be cut at
    # its first dot by this split — confirm the page's date format.
    parts = metadata_text.strip().split('.')
    # Assuming gender and age are separated by a space and age is always a number
    gender = ' '.join(word for word in parts[0].split() if not word.isdigit()).strip()
    date = parts[1].strip() if len(parts) > 1 else "Date not found"
    return gender + ", " + date


def fetch_all_info(url):
    """Download one question page and extract its fields.

    Args:
        url: Address of the question page to fetch.

    Returns:
        A four-element list ``[question, answer, signature, metadata]`` of
        strings:

        * question - text of the first ``div.article-text`` element, or an
          empty string when the page has none.
        * answer - joined ``<p>`` texts of the
          ``.article-text.font-serif.text-base.py-10`` element up to the
          first sign-off phrase (empty when the element is missing).
        * signature - text following the sign-off phrase, with leading
          commas and whitespace removed (empty when there is none).
        * metadata - ``"<gender>, <date>"`` taken from the
          ``div.pt-5.font-serif.text-light`` element, or a placeholder of
          the same shape when that element is missing.
    """
    response = httpx.get(url, headers=config.headers)
    html = HTMLParser(response.text)

    # Extracting the question. css_first yields None when nothing matches, so
    # guard the lookup instead of calling .text() on None.
    question_node = html.css_first('div.article-text')
    questions = question_node.text() if question_node is not None else ""

    # Extracting the answer and the signature
    answers = ""
    cleaned_signature = ""
    answer_div = html.css_first('.article-text.font-serif.text-base.py-10')
    if answer_div:
        full_text = ' '.join(p.text() for p in answer_div.css('p'))
        answers, cleaned_signature = _split_signature(full_text)

    # Extracting the additional metadata for gender and date
    metadata_div = html.css_first('div.pt-5.font-serif.text-light')
    if metadata_div:
        metadata = _parse_metadata(metadata_div.text(deep=True))
    else:
        # Keep the fallback a plain string in the same "gender, date" shape;
        # a tuple here would be written to the CSV as its repr.
        metadata = "Metadata not found, Metadata not found"

    return [questions, answers, cleaned_signature, metadata]

# Endpoint that lists the questions to scrape
studenterspor_url = "https://www.studenterspor.no/ajax_handler.php"
# fetch_question_url gives back None when the listing request fails; fall back
# to an empty list so the loop below does not raise TypeError.
question_urls = fetch_question_url(studenterspor_url) or []

with open('studenterspor.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Question', 'Answer', 'Signature', 'Metadata'])

    # Fetch info for each URL and write it to the CSV file
    for url in question_urls:
        try:
            info = fetch_all_info(url)
        except httpx.HTTPError as exc:
            # One unreachable page should not abort the whole export
            print(f"Skipping {url}: {exc}")
            continue
        writer.writerow(info)  # Write the question, answer, signature and metadata as a new row

if question_urls:
    print("Done! Your data is now in 'studenterspor.csv'")
else:
    print("No question URLs were fetched; 'studenterspor.csv' contains only the header row.")
:sparkles: project started, 200 OK received Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-05 11:13:58 +00:00			`import httpx`
:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00			`from selectolax.parser import HTMLParser`
:sparkles: answers now scraped! redid the signature-scraping aswell Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:45:44 +00:00			`import re`
:sparkles: export info to csv-file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:03:42 +00:00			`import csv`
:art: add config file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:14:05 +00:00			`import config`
:sparkles: project started, 200 OK received Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-05 11:13:58 +00:00
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00			`def fetch_question_url(url):`
			`# Define headers`
:art: add config file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:14:05 +00:00			`headers = config.headers`
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00			`# Define the parameters for the request`
			`params = {`
			`"action": "get-faqs",`
			`"idSource": "2",`
			`"list": "frequently_asked_questions",`
			`"idCategory": "8288", # Id for category`
			`"from": "0",`
:sparkles: export info to csv-file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:03:42 +00:00			`"size": "100", # Adjust this to get more results`
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00			`"gender": "false",`
			`"age": "false",`
			`"zone": "default",`
			`"filter_query": "",`
			`"skip": "0",`
			`"load_categories": "false"`
			`}`
:sparkles: project started, 200 OK received Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-05 11:13:58 +00:00
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00			`response = httpx.get(url, params=params, headers=headers)`

			`# Check if the request was successful`
			`if response.status_code == 200:`
			`# Parse the response`
			`data = response.json()`

			`# Extracting URLs from each item`
			`urls = [item['url'] for item in data['items']]`
			`return urls`
			`else:`
			`print(f"Failed to fetch data: {response.status_code}")`

:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00			`def fetch_all_info(url):`
			`# Define headers`
:art: add config file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:14:05 +00:00			`headers = config.headers`
:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00
			`response = httpx.get(url, headers=headers)`
			`html = HTMLParser(response.text)`

:truck: rename variables to same language Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:54:36 +00:00			`# Extracting the question`
			`questions = html.css_first('div.article-text').text()`
:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00
:truck: rename variables to same language Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:54:36 +00:00			`# Extracting the answer and the signature`
			`answers = ""`
:sparkles: answers now scraped! redid the signature-scraping aswell Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:45:44 +00:00			`signature = ""`
			`specific_div = html.css_first('.article-text.font-serif.text-base.py-10')`
			`if specific_div:`
			`p_elements = specific_div.css('p')`
			`full_text = ' '.join([p.text() for p in p_elements])`

			`# Separate the signature using a regular expression`
			`signature_match = re.search(r'(Vennlig hilsen\|Med vennlig hilsen\|Mvh\|Lykke til!)\s*(.*)', full_text, re.IGNORECASE)`
			`if signature_match:`
			`signature = signature_match.group(2) # This is the signature text`
:truck: rename variables to same language Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:54:36 +00:00			`answers = full_text[:signature_match.start()] # This is the text before the signature`
:sparkles: signatures is now scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:15:49 +00:00			`else:`
:truck: rename variables to same language Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:54:36 +00:00			`answers = full_text # In case there is no signature`
:sparkles: answers now scraped! redid the signature-scraping aswell Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:45:44 +00:00
			`# Clean up the signature`
			`cleaned_signature = re.sub(r'(Vennlig hilsen\|Med vennlig hilsen\|Mvh\|Lykke til!)\s*', '', signature, flags=re.IGNORECASE)`
			`cleaned_signature = re.sub(r'^[\s,]*', '', cleaned_signature).strip() # Remove leading commas and spaces`
:sparkles: signatures is now scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:15:49 +00:00
:sparkles: metadata also collected Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:40:45 +00:00			`# Extracting the additional metadata for gender and date`
			`metadata_div = html.css_first('div.pt-5.font-serif.text-light')`
			`if metadata_div:`
			`metadata_text = metadata_div.text(deep=True).strip()`
			`# Split the text to isolate gender and date`
			`parts = metadata_text.split('.')`
			`# Assuming gender and age are separated by a space and age is always a number`
			`gender_age = parts[0].split()`
			`gender = ' '.join([word for word in gender_age if not word.isdigit()]).strip() # Remove the age part`
			`date = parts[1].strip() if len(parts) > 1 else "Date not found"`
			`metadata = gender + ", " + date`
			`else:`
			`metadata = "Metadata not found", "Metadata not found"`

:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00			`# Create array of results`
:sparkles: metadata also collected Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:40:45 +00:00			`result = [questions, answers, cleaned_signature, metadata]`
:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00
:sparkles: answers now scraped! redid the signature-scraping aswell Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:45:44 +00:00			`return result`
:sparkles: spørsmål is scraped! Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:14:05 +00:00
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00			`studenterspor_url = "https://www.studenterspor.no/ajax_handler.php"`
:truck: rename variable for easier understanding Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 13:21:44 +00:00			`question_urls = fetch_question_url(studenterspor_url)`
:sparkles: now receiving urls for each question Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 12:11:47 +00:00
:sparkles: export info to csv-file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:03:42 +00:00			`with open('studenterspor.csv', mode='w', newline='', encoding='utf-8') as file:`
			`writer = csv.writer(file)`
			`# Write the header`
:sparkles: metadata also collected Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:40:45 +00:00			`writer.writerow(['Question', 'Answer', 'Signature', 'Metadata'])`
:sparkles: export info to csv-file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:03:42 +00:00
			`# Fetch info for each URL and write it to the CSV file`
			`for url in question_urls:`
			`info = fetch_all_info(url)`
:sparkles: metadata also collected Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:40:45 +00:00			`writer.writerow(info) # Write the question, answer, signature and metadata as a new row`
:sparkles: export info to csv-file Co-authored-by: haraldnilsen <harald_998@hotmail.com> Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com> 2024-01-08 14:03:42 +00:00
			`print("Done! Your data is now in 'studenterspor.csv'")`