import random
import time
from urllib.parse import quote, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Nstproxy account credentials and gateway address. The two credential
# strings are placeholders and must be replaced before running.
username = 'TU NombreDeUsuario de Nstproxy'
password = 'Tu contraseña'
host = 'gate.nstproxy.io'
port = '24125'

# Percent-encode the credentials so reserved characters (spaces, '@', ':',
# '/') cannot corrupt the user-info part of the proxy URL.
proxy = (
    f'http://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{host}:{port}'
)

# The same proxy endpoint is used for both plain and TLS traffic.
proxies = {
    "http": proxy,
    "https": proxy,
}

# Browser-like request headers sent with every request.
# NOTE(review): 'br' in Accept-Encoding presumably requires a brotli package
# to be installed for responses to be decoded — confirm in the target env.
custom_headers = {
    # Fixed: the token must read "like Gecko"; the previous "como Gecko" was
    # a translation artifact that no real browser sends.
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
    'Accept-Language': 'da, en-gb, en',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.7',
    'Referer': 'https://www.google.com/',
}
def parse_listing(listing_url, visited_urls, current_page=1, max_pages=2,
                  timeout=30):
    """Scrape the products linked from a search-results page.

    Fetches ``listing_url`` through the configured proxy, visits every
    product link that is not yet in ``visited_urls`` and, while a "next"
    pagination link exists and ``current_page < max_pages``, recurses into
    the following results page.

    Args:
        listing_url: URL of the search-results page to scrape.
        visited_urls: Set of product URLs already scraped; updated in place.
        current_page: 1-based index of the page being scraped.
        max_pages: Maximum number of results pages to follow.
        timeout: Seconds to wait for the listing request before giving up.

    Returns:
        A list with one product-info dict per successfully scraped product,
        covering this page and any following pages that were visited.
    """
    try:
        resp = requests.get(
            listing_url,
            headers=custom_headers,
            proxies=proxies,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # A network failure on one page must not discard what the caller
        # has already collected from earlier pages.
        print(f'Error al obtener la página web: {listing_url} ({exc})')
        return []
    print(resp.status_code)

    soup_search = BeautifulSoup(resp.text, 'lxml')
    link_elements = soup_search.select(
        '[data-cy="title-recipe"] > a.a-link-normal'
    )

    page_data = []
    for link in link_elements:
        href = link.attrs.get('href')
        if not href:
            # urljoin() returns the base URL for an empty href, which would
            # make the listing page itself look like a product page.
            continue
        full_url = urljoin(listing_url, href)
        if full_url in visited_urls:
            continue
        visited_urls.add(full_url)
        print(f'Raspar producto de {full_url[:100]}', flush=True)
        product_info = obtener_info_producto(full_url)
        if product_info:
            page_data.append(product_info)
        # Random pause between product requests.
        time.sleep(random.uniform(3, 7))

    next_page_el = soup_search.select_one('a.s-pagination-next')
    next_href = next_page_el.attrs.get('href') if next_page_el else None
    if next_href and current_page < max_pages:
        next_page_url = urljoin(listing_url, next_href)
        print(
            f'Raspar siguiente página: {next_page_url}'
            f'(Página {current_page + 1} de {max_pages})',
            flush=True
        )
        page_data += parse_listing(
            next_page_url, visited_urls, current_page + 1, max_pages, timeout
        )
    return page_data
def obtener_info_producto(url, timeout=30):
    """Fetch a product page and extract its main fields.

    Args:
        url: URL of the product page.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        A dict with the keys ``titulo``, ``precio``, ``calificacion``,
        ``imagen``, ``descripcion`` and ``url``; a field is ``None`` when its
        element is not found in the page. Returns ``None`` when the request
        fails or the response status is not 200.
    """
    try:
        resp = requests.get(
            url, headers=custom_headers, proxies=proxies, timeout=timeout
        )
    except requests.RequestException as exc:
        # Treat connection errors and timeouts like a failed page instead of
        # aborting the whole crawl.
        print(f'Error al obtener la página web: {url} ({exc})')
        return None
    if resp.status_code != 200:
        print(f'Error al obtener la página web: {url}')
        return None

    soup = BeautifulSoup(resp.text, 'lxml')

    elemento_titulo = soup.select_one('#productTitle')
    titulo = elemento_titulo.text.strip() if elemento_titulo else None

    precio_e = soup.select_one('#corePrice_feature_div span.a-offscreen')
    precio = precio_e.text if precio_e else None

    elemento_calificacion = soup.select_one('#acrPopover')
    texto_calificacion = (
        elemento_calificacion.attrs.get('title')
        if elemento_calificacion else None
    )
    calificacion = None
    if texto_calificacion:
        calificacion = texto_calificacion
        # The request headers ask for English content, so the English
        # wording is removed as well as the Spanish one.
        for sufijo in ('out of 5 stars', 'de 5 estrellas'):
            calificacion = calificacion.replace(sufijo, '')

    elemento_imagen = soup.select_one('#landingImage')
    imagen = elemento_imagen.attrs.get('src') if elemento_imagen else None

    # Use whichever of the two description containers appears first.
    elemento_descripcion = soup.select_one(
        '#productDescription, #feature-bullets > ul'
    )
    descripcion = (
        elemento_descripcion.text.strip() if elemento_descripcion else None
    )

    return {
        'titulo': titulo,
        'precio': precio,
        'calificacion': calificacion,
        'imagen': imagen,
        'descripcion': descripcion,
        'url': url
    }
def main():
    """Scrape the 'apple' search results and write them to manzana.csv."""
    search_url = 'https://www.amazon.com/s?k=apple'
    # Start with an empty set of already-visited product URLs.
    rows = parse_listing(search_url, set())
    pd.DataFrame(rows).to_csv('manzana.csv', index=False)


# Run the scraper only when the file is executed as a script.
if __name__ == '__main__':
    main()