I'm trying to extract several values from the website (https://carone.com.uy/autos-usados-y-0km?p=21). Some work fine, but some don't. For example, I am able to extract the name, model, price and fuel type, but cannot correctly extract the "Year" or "Kilometers" fields, the code always returns "N/A" as the value.
This is the code I'm using:
import pandas as pd
from datetime import date
import os
import socket
import requests
from bs4 import BeautifulSoup
def scrape_product_data(url):
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}
product_data = []
# Make the request to get the HTML content
response = requests.get(url, headers=headers)
response.raise_for_status() # Check if the request was successful
soup = BeautifulSoup(response.text, 'html.parser')
product_elements = soup.find_all('div', class_='product-item-info')
for product_element in product_elements:
# Extract product name, price, model, and attributes as before (same code as previous version)
product_name_element = product_element.select_one('p.carone-car-info-data-brand.cursor-pointer')
product_name = product_name_element.text.strip() if product_name_element else "N/A"
product_price_element = product_element.find('span', class_='price')
product_price = product_price_element.text.strip() if product_price_element else "N/A"
product_model_element = product_element.select_one('p.carone-car-info-data-model')
product_model = product_model_element.get('title').strip() if product_model_element else "N/A"
# Extract product attributes
attributes_div = product_element.find('div', class_='carone-car-attributes')
year_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Año')
year_value = year_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if year_element else "N/A"
kilometers_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Kilómetros')
kilometers_value = kilometers_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if kilometers_element else "N/A"
fuel_element = attributes_div.find('p', class_='carone-car-attribute-title', text='Combustible')
fuel_value = fuel_element.find_previous_sibling('p', class_='carone-car-attribute-value').text if fuel_element else "N/A"
# Append product data as a tuple (name, price, model, year, kilometers, fuel) to the list
product_data.append((product_name, product_price, product_model, year_value, kilometers_value, fuel_value))
The result looks like this: enter image description here
I don't understand why the mentioned value always gets "N/A" while the other ones work fine and the method is the same.
The problem is that instead of
Kilómetros, the site usesKilómetrosin the text of the element (the same goes for age):def scrape_product_data(url): headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3" } product_data = [] response = requests.get(url, headers=headers) response.raise_for_status() soup = BeautifulSoup(response.text, "html.parser") product_elements = soup.find_all("div", class_="product-item-info") for product_element in product_elements: product_name_element = product_element.select_one( "p.carone-car-info-data-brand.cursor-pointer" ) product_name = ( product_name_element.text.strip() if product_name_element else "N/A" ) product_price_element = product_element.find("span", class_="price") product_price = ( product_price_element.text.strip() if product_price_element else "N/A" ) product_model_element = product_element.select_one( "p.carone-car-info-data-model" ) product_model = ( product_model_element.get("title").strip() if product_model_element else "N/A" ) attributes_div = product_element.find("div", class_="carone-car-attributes") year_element = attributes_div.find( "p", class_="carone-car-attribute-title", string="Año" ) year_value = ( year_element.find_previous_sibling( "p", class_="carone-car-attribute-value" ).text if year_element else "N/A" ) kilometers_element = attributes_div.find( "p", class_="carone-car-attribute-title", string="Kilómetros" ) kilometers_value = ( kilometers_element.find_previous_sibling( "p", class_="carone-car-attribute-value" ).text if kilometers_element else "N/A" ) fuel_element = attributes_div.find( "p", class_="carone-car-attribute-title", string="Combustible" ) fuel_value = ( fuel_element.find_previous_sibling( "p", class_="carone-car-attribute-value" ).text if fuel_element else "N/A" ) product_data.append( ( product_name, product_price, product_model, year_value, kilometers_value, fuel_value, ) ) return pd.DataFrame( product_data, columns=["Name", "Price", "Model", "Year", "KM", "Fuel"] ) df = scrape_product_data("https://carone.com.uy/autos-usados-y-0km?p=2") print(df)Print results: