====== python pandas ======
{{INLINETOC}}
=== etc : list ===
set( [list] ) # unique value
[list].sort() # in-place 정렬 (리스트 자체가 바뀌고 None 반환) — 새 리스트가 필요하면 sorted([list])
[list1] + [list2] #list합치기
===== shape of df =====
=== Pivot_table ===
df.pivot_table(index=[인덱스컬럼],
columns=[컬럼1,컬럼2],
values=[값],
aggfunc='sum').reset_index()
* string일 때는 aggfunc='max'
* index에 NULL 있으면 안됨
== fillna ==
df[['a','b','c']] = df[['a','b','c']].fillna('')
=== group by ===
df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()
df.groupby([COLUMNS])['COLUMN'].max().reset_index()
df = df.assign(date=pd.to_numeric(df['date'], errors='coerce')).groupby(['코드', '종목명']).agg({'date':np.min}).reset_index().drop_duplicates()
df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.min, np.max]}).reset_index()  # groupby 키가 '코드'이므로 level='종목코드'는 KeyError
df.columns = df.columns.droplevel()
=== rank ===
df['rank'] = df.groupby('code')['value'].rank(ascending=False)
=== merge ===
df_out = df_out.merge(df, on=['no', 'name'], how='outer') #left_on right_on
===== modify =====
=== Series to DF ===
df = pd.concat([df, pd.DataFrame(pd.Series(dict_row)).transpose()], ignore_index=True)  # DataFrame.append는 pandas 2.0에서 제거됨 → pd.concat 사용
=== rename ===
df.rename(columns = {'컬럼':'new name'}, index={0:'new index'})
df.rename({1: 2, 2: 4}, axis='index')
df.columns = [컬럼들..]
df.columns = ['1'] + df.columns[1:].tolist()
=== order of columns ===
#1
df = df.sort_index(axis='columns', level = 'MULTILEVEL INDEX NAME/no')
#2
df.columns
col_order = ['a','b','c']
df = df.reindex(col_order, axis='columns')
=== map ===
df['코드'] = 'A' + df['코드'].map(lambda x: f'{x:0>6}') #6글자로
===== get info =====
=== Shapes ===
df.shape
df.describe()
df.info()
#use the dataframe.nunique() function to find the unique values
unique_counts = pd.DataFrame.from_records([(col, df[col].nunique()) for col in df.columns],columns=['Column_Name', 'Num_Unique']).sort_values(by=['Num_Unique'])
df.loc[df.sex=='W', 'year_born'].hist()
=== type ===
df['value'] = pd.to_numeric(df['value'], errors='coerce')
df['value'].dtype
===== get element =====
=== Selects ===
iloc: Select by position
loc: Select by label
df.loc[:,~df.columns.isin(['a','b'])]
df[~( df['a'].isin(['1','2','3']) & (df['b']=='3') )] #row-wise — &가 ==보다 우선순위가 높으므로 비교식은 반드시 괄호로 감쌀 것
df.loc[~( df['a'].isin(['1','2','3']) & (df['b']=='3') ), 8] #row-wise & column
=== row iteration ===
for idx,row in anime[:2].iterrows():
print(idx, row)
=====I/O file=====
=== encoding_errors - 'ignore'===
Encoding 제대로 했는데도 안되면..
공공데이터가 이런 경우가 많음.
Error tokenizing data. C error: EOF inside string starting at row 0 | 판다스 에러
https://con2joa.tistory.com/m/60
quoting=csv.QUOTE_NONE 파라미터
import chardet
with open(file, 'rb') as rawdata:
result = chardet.detect(rawdata.read(100000))
result
data = pd.read_csv( file, encoding='cp949', encoding_errors='ignore')
# on_bad_lines='skip'
# error_bad_lines=False
=== to_numberic ===
#1
df = pd.read_csv('file.csv', encoding='utf-8', index_col=0, converters={'col':int, 'col2':str})
#2
df['col'] = pd.to_numeric(df['col'].str.replace(',',''), errors='coerce')
=== Excel ===
writer = pd.ExcelWriter('file.xlsx', engine = 'xlsxwriter')
df.to_excel(writer, sheet_name = '시트이름')
writer.close()  # pandas 2.0부터 writer.save()는 제거됨 — close()가 저장까지 수행 (또는 with pd.ExcelWriter(...) as writer: 사용)
=== chunk ===
for index, chunk in enumerate( pd.read_csv(filename, chunksize=chunk_size, dtype=str), start=1):
chunk.memory_usage()
df_chunk = pd.read_csv(r'../input/data.csv', chunksize=1000000) # chunksize : 한 chunk당 행(line) 수
for chunk in df_chunk:
# perform data filtering
chunk_filter = chunk_preprocessing(chunk)
df3=pd.read_csv("abc.txt", skiprows=1, header=None,
sep="|", nrows=100, encoding="utf-16") #, error_bad_lines=False)
with open("abc.txt", encoding="utf-16") as f:
i=0
for line in f:
i=i+1
print(line)
if i >3 : break
file open/ append lines : https://thispointer.com/how-to-append-text-or-lines-to-a-file-in-python/
{{tag>software_development python 파이썬 pandas 데이터정리 data_analysis}}
~~DISCUSSION~~