Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
software_development:python_pandas [2022/08/04 14:39]
prgram
software_development:python_pandas [2023/05/16 15:17] (current)
prgram [encoding_errors - 'ignore']
Line 2: Line 2:
 {{INLINETOC}} {{INLINETOC}}
  
-=== Shapes ​===+=== etc : list ===
 <code python> <code python>
-df.shape +set[list] ​# unique value 
-df.describe() +[list].sort() #​자동적용?​ 
-df.info()+[list1] + [list2] ​ #​list합치기
 </​code>​ </​code>​
  
-#use the dataframe.nunique() function to find the unique values +===== shape of df ===== 
-unique_counts ​pd.DataFrame.from_records([(col,​ df[col].nunique()) for col in df.columns],​columns=['​Column_Name',​ '​Num_Unique'​]).sort_values(by=['​Num_Unique'​]) +=== Pivot_table ​===
- +
-df.loc[df.sex=='​W',​ '​year_born'​].hist() +
- +
-=== Selects ​===+
 <code python> <code python>
-iloc: Select by position +df.pivot_table(index=[인덱스컬럼],​ 
-loc: Select by label+               columns=[컬럼1,​컬럼2],​ 
 +               ​values=[값],​ 
 +               ​aggfunc='​sum'​).reset_index() 
 +</​code>​ 
 +  * string일 때는 aggfunc='​max'​ 
 +  * index에 NULL 있으면 안됨 
 +== fillna == 
 +<code python>​ 
 +df[['​a','​b','​c'​]] = df[['​a','​b','​c'​]].fillna(''​)
 </​code>​ </​code>​
  
 +=== group by ===
 +<code python>
 +df.groupby([컬럼들]).agg({'​컬럼':​sum}).reset_index()
  
-   +df.groupby([COLUMNS])['​COLUMN'​].max().reset_index()
-   +
-==I/O file==  ​+
  
 +df = df.assign(date=pd.to_numeric(df['​date'​],​ errors='​coerce'​)).groupby(['​코드',​ '​종목명'​]).agg({'​date':​np.min}).reset_index().drop_duplicates()
  
-=== chunk === +df df[['​코드'​'​date'​]].groupby(['​코드'​]).agg({'​date':​ [np.minnp.max]}).reset_index(level='​종목코드'​
-<code python>​ +df.columns = df.columns.droplevel()
-for indexchunk in enumeratepd.read_csv(filenamechunksize=chunk_size,​ dtype=str), start=1): +
-  chunk.memory_usage()+
 </​code>​ </​code>​
  
 +=== rank ===
 <code python> <code python>
-df_chunk ​pd.read_csv(r'../​input/​data.csv', chunksize=1000000# chink : # of lines +df['​rank'​] ​df.groupby('code')['​value'​].rank(ascending=False)
-for chunk in df_chunk: ​  +
-    # perform data filtering  +
-    chunk_filter = chunk_preprocessing(chunk)+
 </​code>​ </​code>​
  
 +=== merge ===
 <code python> <code python>
-df3=pd.read_csv("​abc.txt"​skiprows=1header=None, +df_out ​df_out.merge(dfon=['​no'​'​name'​]how='​outer'​) #left_on right_on
-               sep="​|",​ nrows=100, encoding="​utf-16"​) #, error_bad_lines=False) +
- +
-with open("​abc.txt",​ encoding="​utf-16"​) as f: +
-    i=0 +
-    for line in f: +
-        i=i+1 +
-        print(line) +
-        if i >3 : break +
-</​code>​ +
-           +
-           +
-file open/ append lines : https://​thispointer.com/​how-to-append-text-or-lines-to-a-file-in-python/ ​          +
-=== row iteration === +
-<code python>​ +
-for idx,row in anime[:​2].iterrows():​ +
-    print(idx, row)+
 </​code>​ </​code>​
  
  
 +===== modify =====
 === Series to DF === === Series to DF ===
 <code python> <code python>
 df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(),​ ignore_index=True) df = df.append(pd.DataFrame(pd.Series(dict_row)).transpose(),​ ignore_index=True)
 </​code>​ </​code>​
 +
 +
 === rename === === rename ===
 <code python> <code python>
Line 72: Line 62:
 df.columns = ['​1'​] + df.columns[1:​].tolist() df.columns = ['​1'​] + df.columns[1:​].tolist()
 </​code>​ </​code>​
 +
 +=== order of columns ===
 +<code python>
 +#1
 +df = df.sort_index(axis='​columns',​ level = '​MULTILEVEL INDEX NAME/​no'​)
 +#2
 +df.columns
 +col_order = ['​a','​b','​c'​]
 +df = df.reindex(col_order,​ axis='​columns'​)
 +</​code>​
 +
  
 === map === === map ===
Line 77: Line 78:
 df['​코드'​] = '​A'​ + df['​코드'​].map(lambda x: f'​{x:​0>​6}'​) ​ #6글자로 df['​코드'​] = '​A'​ + df['​코드'​].map(lambda x: f'​{x:​0>​6}'​) ​ #6글자로
 </​code>​ </​code>​
-=== Pivot_table ​===+ 
 + 
 +===== get info ===== 
 + 
 +=== Shapes ​===
 <code python> <code python>
-df.pivot_table(index=[인덱스컬럼],​ +df.shape 
-               columns=[컬럼1,​컬럼2],​ +df.describe() 
-               ​values=[값],​ +df.info()
-               ​aggfunc='​sum'​).reset_index()+
 </​code>​ </​code>​
  
-=== group by ===+#use the dataframe.nunique() function to find the unique values 
 +unique_counts ​pd.DataFrame.from_records([(col,​ df[col].nunique()) for col in df.columns],​columns=['​Column_Name',​ '​Num_Unique'​]).sort_values(by=['​Num_Unique'​]) 
 + 
 +df.loc[df.sex=='​W',​ '​year_born'​].hist() 
 + 
 +=== type ===
 <code python> <code python>
-df.groupby([컬럼들]).agg({'컬럼':sum}).reset_index()+df['​value'​= pd.to_numeric(df['value'​],​ errors='​coerce') 
 +df['​value'​].dtype 
 +</​code>​
  
-df = df.assign(date=pd.to_numeric(df['​date'​],​ errors='​coerce'​)).groupby(['​코드',​ '​종목명'​]).agg({'​date':​np.min}).reset_index().drop_duplicates() 
  
-df = df[['코드', 'date']].groupby(['코드']).agg({'date': [np.minnp.max]}).reset_index(level='종목코드') +===== get element ===== 
-df.columns = df.columns.droplevel()+ 
 +=== Selects === 
 +<code python>​ 
 +iloc: Select by position 
 +loc: Select by label 
 +   
 +df.loc[:,​~df.columns.isin(['a','​b'])  
 + 
 +df[~df['a'].isin(['1','​2','​3'​]) & df['​b'​]=='3' )] #​row-wise 
 +df.loc[~( ​df['​a'​].isin(['​1','​2','​3'​]& df['​b'​]=='​3'​ ), 8] #​row-wise & column
 </​code>​ </​code>​
  
-=== merge ===+=== row iteration ​===
 <code python> <code python>
-df_out = df_out.merge(dfon=['​no',​ '​name'​], how='​outer'​)+for idx,row in anime[:2].iterrows():​ 
 +    print(idxrow)
 </​code>​ </​code>​
  
-=== rank ===+   
 +   
 +=====I/O file===== 
 + 
 +=== encoding_errors - '​ignore'​=== 
 +Encoding 제대로 했는데도 안되면.. 
 +공공데이터가 이런 경우가 많음. 
 + 
 +Error tokenizing data. C error: EOF inside string starting at row 0 | 판다스 에러 
 +https://​con2joa.tistory.com/​m/​60 
 +quoting=csv.QUOTE_NONE 파라미터 
 <code python> <code python>
-df['rank'df.groupby('code')['value'].rank(ascending=False)+import chardet 
 +with open(file, ​'rb') as rawdata: 
 +    result ​chardet.detect(rawdata.read(100000)) 
 +result 
 + 
 + 
 +data = pd.read_csv( file, encoding='cp949',​ encoding_errors='​ignore') 
 +# on_bad_lines='skip' 
 +# error_bad_lines=False
 </​code>​ </​code>​
 +
 === to_numberic === === to_numberic ===
 <code python> <code python>
Line 120: Line 160:
 </​code>​ </​code>​
  
 +=== chunk ===
 +<code python>
 +for index, chunk in enumerate( pd.read_csv(filename,​ chunksize=chunk_size,​ dtype=str), start=1):
 +  chunk.memory_usage()
 +</​code>​
  
-=== type === 
 <code python> <code python>
-df['​value'​] ​= pd.to_numeric(df['value']errors='​coerce'​+df_chunk ​= pd.read_csv(r'../​input/​data.csv', ​chunksize=1000000# chink : # of lines 
-df['​value'​].dtype+for chunk in df_chunk: ​  
 +    # perform data filtering  
 +    chunk_filter = chunk_preprocessing(chunk)
 </​code>​ </​code>​
 +
 +<code python>
 +df3=pd.read_csv("​abc.txt",​ skiprows=1, header=None,​
 +               ​sep="​|",​ nrows=100, encoding="​utf-16"​) #, error_bad_lines=False)
 +
 +with open("​abc.txt",​ encoding="​utf-16"​) as f:
 +    i=0
 +    for line in f:
 +        i=i+1
 +        print(line)
 +        if i >3 : break
 +</​code>​
 +          ​
 +          ​
 +file open/ append lines : https://​thispointer.com/​how-to-append-text-or-lines-to-a-file-in-python/ ​         ​
 +
 +
 +
 +
 +
 +
 +
  
 {{tag>​software_development python 파이썬 pandas 데이터정리 data_analysis}} {{tag>​software_development python 파이썬 pandas 데이터정리 data_analysis}}