1

I have this code, and I want to extract holidays, petrol prices, and temperature, but I don't know where the problem is. I need your help as soon as possible, please. I want to add the extracted values to my dataset by matching the scraped data against the dates in my dataset's date column. I also want to test the impact of each variable (holidays, temperature, ...).

import datetime
import json
import re
import time

import datefinder
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from googletrans import Translator

# Seconds to wait for any single HTTP response, so that one stalled server
# cannot hang the whole enrichment run.
_REQUEST_TIMEOUT = 30

# Rows dated before this year get NaN instead of a scraped temperature.
# NOTE(review): presumably the first year the weather site covers - confirm.
_FIRST_WEATHER_YEAR = 2009

# Position of the January value in the flat list of <td> cells of a yearly
# weather page, and the distance between the cells of two consecutive months.
_FIRST_MONTH_CELL = 1
_CELLS_PER_MONTH = 32

# A number (optionally signed, optionally with a decimal part) at the start
# of a table cell.
_LEADING_NUMBER_RE = re.compile(r'\s*(-?\d+(?:[.,]\d+)?)')


def _scrape_holiday_keys(country):
    """Return the set of ``(month, day)`` pairs listed as holidays for *country*.

    The country name is translated to English and used to build the
    timeanddate.com holidays URL.  Raises ``requests.HTTPError`` when the page
    cannot be retrieved, because silently flagging no holidays would be
    misleading.
    """
    translator = Translator()
    country_en = translator.translate(country, dest='en').text.strip().lower()
    # Assumes timeanddate.com uses hyphenated slugs (e.g. "south-africa");
    # a raw space would not produce a usable URL - TODO confirm.
    slug = country_en.replace(' ', '-')

    url = f'https://www.timeanddate.com/holidays/{slug}/'
    page = requests.get(url, timeout=_REQUEST_TIMEOUT)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    keys = set()
    # The first four <th> cells are skipped, exactly as the original code did
    # (presumably the table's column headers).
    for cell in soup.find_all('th')[4:]:
        matches = list(datefinder.find_dates(cell.text))
        # Cells without a recognisable date are ignored instead of being kept
        # as raw strings.
        if matches:
            found = matches[-1]
            keys.add((found.month, found.day))
    return keys


def _scrape_monthly_temperatures(country, year):
    """Return a list of 12 values (January..December) scraped for *year*.

    Each value is a float, or NaN when the page could not be fetched, the
    expected cell is missing, or the cell does not start with a number.
    """
    url = f'https://www.historique-meteo.net/afrique/{country.lower()}/{year}/'
    try:
        page = requests.get(url, timeout=_REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as err:
        print(f"Could not fetch weather data for {year}: {err}")
        return [np.nan] * 12

    cells = BeautifulSoup(page.content, 'html.parser').find_all('td')
    temperatures = []
    for month_index in range(12):
        cell_index = _FIRST_MONTH_CELL + month_index * _CELLS_PER_MONTH
        value = np.nan
        if cell_index < len(cells):
            match = _LEADING_NUMBER_RE.match(cells[cell_index].text)
            if match:
                value = float(match.group(1).replace(',', '.'))
        temperatures.append(value)
    return temperatures


def _scrape_petrol_price(day):
    """Return the last adjusted close Yahoo Finance reports for ``CL=F`` on *day*.

    Returns NaN when the request fails, the response is not JSON, or the
    response carries no price for that day.
    """
    try:
        # Convert to POSIX timestamps (local time) that can go into the URL.
        period_start = int(time.mktime(day.timetuple()))
        next_day = day + datetime.timedelta(days=1)
        period_end = int(time.mktime(next_day.timetuple()))
    except (OverflowError, ValueError):
        # Date outside the range the platform's mktime can represent.
        return np.nan

    url_petrole = (
        'https://query2.finance.yahoo.com/v8/finance/chart/CL=F'
        '?formatted=true&crumb=RoQtzbt66M5&lang=en-US&region=US&interval=1d'
        f'&period1={period_start}&period2={period_end}'
        '&events=div%7Csplit&corsDomain=finance.yahoo.com'
    )
    headers = {'Referer': 'https://finance.yahoo.com/quote/CL%3DF/history?p=CL%3DF'}
    try:
        response = requests.get(url_petrole, headers=headers, timeout=_REQUEST_TIMEOUT)
        payload = response.json()
    except (requests.RequestException, ValueError):
        return np.nan

    # Walk the JSON defensively: any level may be missing or null.
    chart = payload.get('chart') if isinstance(payload, dict) else None
    results = (chart or {}).get('result') or []
    if not results:
        return np.nan
    adjclose_series = (results[0].get('indicators') or {}).get('adjclose') or []
    if not adjclose_series:
        return np.nan
    closes = adjclose_series[0].get('adjclose') or []
    if not closes or closes[-1] is None:
        return np.nan
    return float(closes[-1])


def web_scraping(user_data, dateColumn, country, weather=False, holidays=True, petrole=False):
    """Return a copy of *user_data* enriched with scraped, date-based features.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Input data; it is copied, never modified.
    dateColumn : str
        Name of the column holding the dates (datetimes, or values that
        ``pandas.to_datetime`` can parse).
    country : str
        Country name.  It is translated to English for the holidays site and
        used lower-cased, as given, in the weather site URL.
    weather : bool
        Add ``temp_moy``: the value scraped for the row's year and month, or
        NaN for rows before 2009 or when nothing could be scraped.
    holidays : bool
        Add ``is_holiday``: 1 when the row's month and day match any scraped
        holiday, else 0.  Only month and day are compared, so the dates on the
        scraped page are applied to every year in the data.
    petrole : bool
        Add ``petrole_USD``: the adjusted close scraped for the row's date, or
        NaN when none is available.

    Returns
    -------
    pandas.DataFrame
        The enriched copy.
    """
    start_time = time.time()
    df = user_data.copy()

    dates = None
    if holidays or weather or petrole:
        # Parsed once and shared; the column in ``df`` itself is left as is.
        dates = pd.to_datetime(df[dateColumn])

    if holidays:
        print("")
        print("Adding holidays data ......")
        holiday_keys = _scrape_holiday_keys(country)
        # A row is a holiday when it matches ANY scraped date; the column is
        # assigned positionally, so the frame's index does not matter.
        df['is_holiday'] = [
            int((month, day) in holiday_keys)
            for month, day in zip(dates.dt.month, dates.dt.day)
        ]

    if weather:
        print("")
        print("Adding weather data ......")
        temperatures_by_year = {}
        temp_moy = []
        for day in dates:
            if pd.isna(day) or day.year < _FIRST_WEATHER_YEAR:
                temp_moy.append(np.nan)
                continue
            # One download per distinct year instead of one per row.
            if day.year not in temperatures_by_year:
                temperatures_by_year[day.year] = _scrape_monthly_temperatures(country, day.year)
            temp_moy.append(temperatures_by_year[day.year][day.month - 1])
        df['temp_moy'] = temp_moy

    if petrole:
        print("")
        print("Adding petrol data ......")
        price_by_day = {}
        prices = []
        for day in dates:
            if pd.isna(day):
                prices.append(np.nan)
                continue
            # One request per distinct date instead of one per row.
            if day not in price_by_day:
                price_by_day[day] = _scrape_petrol_price(day)
            prices.append(price_by_day[day])
        df['petrole_USD'] = prices

    print("")
    print("CPU time for the third sub-module : %s seconds" % (time.time() - start_time))

    return df

0 Answers0