====== python pandas ====== {{INLINETOC}} === etc : list === set( [list] ) # unique value [list].sort() # in-place 정렬 (반환값은 None) [list1] + [list2] #list합치기 ===== shape of df ===== === Pivot_table === df.pivot_table(index=[인덱스컬럼], columns=[컬럼1,컬럼2], values=[값], aggfunc='sum').reset_index() * string일 때는 aggfunc='max' * index에 NULL 있으면 안됨 == fillna == df[['a','b','c']] = df[['a','b','c']].fillna('') === group by === df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index() df.groupby([COLUMNS])['COLUMN'].max().reset_index() df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates() df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index(level='코드') df.columns = df.columns.droplevel() === rank === df['rank'] = df.groupby('code')['value'].rank(ascending=False) === merge === df_out = df_out.merge(df, on=['no', 'name'], how='outer') #left_on right_on ===== modify ===== === Series to DF === df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(), ignore_index=True) === rename === df.rename(columns = {'컬럼':'new name'}, index={0:'new index'}) df.rename({1: 2, 2: 4}, axis='index') df.columns = [컬럼들..] 
df.columns = ['1'] + df.columns[1:].tolist() === order of columns === #1 df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no') #2 df.columns col_order = ['a','b','c'] df = df.reindex(col_order, axis='columns') === map === df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로 ===== get info ===== === Shapes === df.shape df.describe() df.info() #use the dataframe.nunique() function to find the unique values unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique']) df.loc[df.sex=='W', 'year_born'].hist() === type === df['value'] = pd.to_numeric(df['value'], errors='coerce') df['value'].dtype ===== get element ===== === Selects === iloc: Select by position loc: Select by label df.loc[:,~df.columns.isin(['a','b'])] df[~( df['a'].isin(['1','2','3']) & (df['b']=='3') )] #row-wise (& 가 == 보다 우선순위 높으므로 비교식에 괄호 필요) df.loc[~( df['a'].isin(['1','2','3']) & (df['b']=='3') ), 8] #row-wise & column === row iteration === for idx,row in anime[:2].iterrows(): print(idx, row) =====I/O file===== === encoding_errors - 'ignore'=== Encoding 제대로 했는데도 안되면.. 공공데이터가 이런 경우가 많음. Error tokenizing data. 
C error: EOF inside string starting at row 0 | 판다스 에러 https://con2joa.tistory.com/m/60 quoting=csv.QUOTE_NONE 파라미터 import chardet with open(file, 'rb') as rawdata: result = chardet.detect(rawdata.read(100000)) result data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore') # on_bad_lines='skip' # error_bad_lines=False === to_numeric === #1 df = pd.read_csv('file.csv', encoding='utf-8', index_col=0, converters={'col':int, 'col2':str}) #2 df['col'] = pd.to_numeric(df['col'].str.replace(',',''), errors='coerce') === Excel === writer = pd.ExcelWriter('file.xlsx', engine = 'xlsxwriter') df.to_excel(writer, sheet_name = '시트이름') writer.save() writer.close() === chunk === for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1): chunk.memory_usage() df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chunksize : # of lines for chunk in df_chunk: # perform data filtering chunk_filter = chunk_preprocessing(chunk) df3=pd.read_csv("abc.txt", skiprows=1, header=None, sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False) with open("abc.txt", encoding="utf-16") as f: i=0 for line in f: i=i+1 print(line) if i >3 : break file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/ {{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}} ~~DISCUSSION~~