This repository has been archived on 2024-12-13. You can view files and clone it, but cannot push or open issues or pull requests.
helseveileder_webscraper/main.py
Sindre Kjelsrud 709937da9d
export info to csv-file
Co-authored-by: haraldnilsen <harald_998@hotmail.com>
Signed-off-by: Sindre Kjelsrud <kjelsrudsindre@gmail.com>
2024-01-08 15:03:42 +01:00

92 lines
No EOL
3.1 KiB
Python

import httpx
from selectolax.parser import HTMLParser
import re
import csv
def fetch_question_url(url, category_id="8288", size="100"):
    """Fetch the URLs of the frequently asked questions in one category.

    Sends a GET request to the AJAX handler at *url* with the ``get-faqs``
    action and collects the ``url`` field of every entry in the ``items``
    list of the JSON response.

    Args:
        url: Address of the AJAX handler to query.
        category_id: Value sent as the ``idCategory`` query parameter.
        size: Value sent as the ``size`` query parameter; raise it to
            request more results.

    Returns:
        A list holding the ``url`` value of every returned item. When the
        server does not answer with status 200 the failure is printed and an
        empty list is returned, so the result can always be iterated.
    """
    # Browser-like User-Agent sent with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
    }

    # Query parameters for the request
    params = {
        "action": "get-faqs",
        "idSource": "2",
        "list": "frequently_asked_questions",
        "idCategory": category_id,  # Id for category
        "from": "0",
        "size": size,  # Adjust this to get more results
        "gender": "false",
        "age": "false",
        "zone": "default",
        "filter_query": "",
        "skip": "0",
        "load_categories": "false",
    }

    response = httpx.get(url, params=params, headers=headers)

    # Bail out early with an iterable result instead of an implicit None,
    # which would make callers that loop over the return value crash.
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    data = response.json()
    return [item['url'] for item in data['items']]
def fetch_all_info(url):
    """Scrape the question, the answer and the answer's signature from a page.

    Downloads *url*, takes the text of the first ``div.article-text`` element
    as the question, and joins the text of all ``<p>`` elements inside the
    first ``.article-text.font-serif.text-base.py-10`` element as the answer.
    If the answer contains one of the closing greetings ("Vennlig hilsen",
    "Med vennlig hilsen", "Mvh", "Lykke til!", case-insensitive), the text
    after the first greeting is treated as the signature and the text before
    it as the answer.

    Args:
        url: Address of the question page to download.

    Returns:
        A list ``[question, answer, signature]`` of strings. Any part that
        cannot be found on the page is an empty string.
    """
    # Browser-like User-Agent sent with the request
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
    }
    response = httpx.get(url, headers=headers)
    html = HTMLParser(response.text)

    # Extracting the question. css_first yields None when nothing matches,
    # so guard the lookup instead of calling .text() on None.
    question_div = html.css_first('div.article-text')
    questions = question_div.text() if question_div else ""

    # Closing greetings that introduce the signature; shared by both regexes
    greeting = r'(Vennlig hilsen|Med vennlig hilsen|Mvh|Lykke til!)\s*'

    # Extracting the answer and the signature
    answers = ""
    signature = ""
    specific_div = html.css_first('.article-text.font-serif.text-base.py-10')
    if specific_div:
        full_text = ' '.join(p.text() for p in specific_div.css('p'))

        # Separate the signature using a regular expression
        signature_match = re.search(greeting + r'(.*)', full_text, re.IGNORECASE)
        if signature_match:
            signature = signature_match.group(2)  # Text after the greeting
            answers = full_text[:signature_match.start()]  # Text before the greeting
        else:
            answers = full_text  # In case there is no signature

    # Clean up the signature: drop any further greetings it still contains,
    # then leading commas and whitespace.
    cleaned_signature = re.sub(greeting, '', signature, flags=re.IGNORECASE)
    cleaned_signature = re.sub(r'^[\s,]*', '', cleaned_signature).strip()

    return [questions, answers, cleaned_signature]
# AJAX endpoint that is queried for the list of question pages
studenterspor_url = "https://www.studenterspor.no/ajax_handler.php"

# Normalise a missing result (None) to an empty list so the loop below is
# simply skipped instead of raising TypeError when the listing request fails.
question_urls = fetch_question_url(studenterspor_url) or []

if not question_urls:
    print("No question URLs were fetched; only the header row will be written.")

# Open a new CSV file to write the data
with open('studenterspor.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['Question', 'Answer', 'Signature'])

    # Fetch info for each URL and write it to the CSV file
    for url in question_urls:
        info = fetch_all_info(url)
        writer.writerow(info)  # Write the question, answer, and signature as a new row

print("Done! Your data is now in 'studenterspor.csv'")