Omnibus committed on
Commit f955f7a · verified · 1 Parent(s): f6f72b2

Update app.py

Files changed (1)
  1. app.py +166 -34
app.py CHANGED
@@ -14,6 +14,7 @@ import uuid
 #from query import tasks
 from agent import (
     PREFIX,
+    SAVE_MEMORY,
     COMPRESS_DATA_PROMPT,
     COMPRESS_DATA_PROMPT_SMALL,
     LOG_PROMPT,
@@ -24,6 +25,10 @@ api=HfApi()
 client = InferenceClient(
     "mistralai/Mixtral-8x7B-Instruct-v0.1"
 )
+reponame="Omnibus/tmp"
+save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
+token_self = os.environ['HF_TOKEN']
+api=HfApi(token=token_self)
 
 def find_all(url):
     return_list=[]
@@ -289,7 +294,7 @@ def compress_data_og(c, instruct, history):
 
 
 
-def summarize(inp,history,report_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
+def summarize(inp,history,report_check,sum_mem_check,data=None,files=None,url=None,pdf_url=None,pdf_batch=None):
     json_box=[]
     if inp == "":
         inp = "Process this data"
@@ -313,9 +318,6 @@ def summarize(inp,history,report_check,data=None,files=None,url=None,pdf_url=Non
     except Exception as e:
         print(e)
         #data=f'{data}\nError reading URL ({batch_url})'
-
-
-
     if pdf_url.startswith("http"):
         print("PDF_URL")
         out = read_pdf_online(pdf_url)
@@ -352,41 +354,170 @@ def summarize(inp,history,report_check,data=None,files=None,url=None,pdf_url=Non
             if i == " " or i=="," or i=="\n":
                 c +=1
         print (f'c:: {c}')
-
-        json_out = compress_data(c,inp,out)
-        #json_box.append(json_out)
-
-        #json_object = json.dumps(eval(json_out), indent=4)
-        #json_box.append(json_out)
-        print(f'JSON_BOX:: {json_out}')
-        # Writing to sample.json
-        #with open("tmp.json", "w") as outfile:
-        #    outfile.write(json_object)
-        #outfile.close()
-
-        #json_box.append(json_out)
-        out = str(json_out)
-        if report_check:
-            rl = len(out)
-            print(f'rl:: {rl}')
-            c=1
-            for i in str(out):
-                if i == " " or i=="," or i=="\n":
-                    c +=1
-            print (f'c2:: {c}')
-            rawp = compress_data_og(c,inp,out)
-        else:
-            rawp = out
+        if sum_mem_check=="Memory":
+            save_memory(inp,out)
+            rawp = "Complete"
+        if sum_mem_check=="Summarize":
+            json_out = compress_data(c,inp,out)
+
+            out = str(json_out)
+            if report_check:
+                rl = len(out)
+                print(f'rl:: {rl}')
+                c=1
+                for i in str(out):
+                    if i == " " or i=="," or i=="\n":
+                        c +=1
+                print (f'c2:: {c}')
+                rawp = compress_data_og(c,inp,out)
+            else:
+                rawp = out
     else:
         rawp = "Provide a valid data source"
-    #print (rawp)
-    #print (f'out:: {out}')
-    #history += "observation: the search results are:\n {}\n".format(out)
-    #task = "complete?"
     history.clear()
     history.append((inp,rawp))
     yield "", history,error_box,json_out
+SAVE_MEMORY = """
+You are attempting to complete the task
+task: {task}
+Data:
+{history}
+Instructions:
+Compile and categorize the data above into a JSON dictionary string
+Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
+Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
+Required keys:
+"keywords":["short", "list", "of", "keywords", "relevant", "to", "this", "entry"]
+"title":"title of entry"
+"description":"description of entry"
+"content":"full content of data about entry"
+"url":"https://url.source"
+"""
+
+def save_memory(purpose, history):
+    uid=uuid.uuid4()
+    history=str(history)
+    c=0
+    inp = str(history)
+    rl = len(inp)
+    print(f'rl:: {rl}')
+    for i in str(inp):
+        if i == " " or i=="," or i=="\n" or i=="/" or i=="." or i=="<":
+            c +=1
+    print (f'c:: {c}')
+
+    seed=random.randint(1,1000000000)
+
+    print (c)
+    #tot=len(purpose)
+    #print(tot)
+    divr=int(c)/MAX_DATA
+    divi=int(divr)+1 if divr != int(divr) else int(divr)
+    chunk = int(int(c)/divr)
+    print(f'chunk:: {chunk}')
+    print(f'divr:: {divr}')
+    print (f'divi:: {divi}')
+    #out = []
+    #out=""
+    s=0
+    e=chunk
+    print(f'e:: {e}')
+    new_history=""
+    task = f'Index this Data\n'
+    for z in range(divi):
+        print(f's:e :: {s}:{e}')
+
+        hist = inp[s:e]
+
+        resp = run_gpt(
+            SAVE_MEMORY,
+            stop_tokens=["observation:", "task:", "action:", "thought:"],
+            max_tokens=4096,
+            seed=seed,
+            purpose=purpose,
+            task=task,
+            history=hist,
+        ).strip('\n')
+        #new_history = resp
+        print (resp)
+        #out+=resp
+        e=e+chunk
+        s=s+chunk
+    print ("final1" + resp)
+    try:
+        resp='[{'+resp.split('[{')[1].split('</s>')[0]
+        print ("final2\n" + resp)
+        print(f"keywords:: {resp['keywords']}")
+    except Exception as e:
+        resp = resp
+        print(e)
+    timestamp=str(datetime.datetime.now())
+    timename=timestamp.replace(" ","--").replace(":","-").replace(".","-")
+    json_object=resp
+    #json_object = json.dumps(out_box)
+    #json_object = json.dumps(out_box,indent=4)
+    with open(f"tmp-{uid}.json", "w") as outfile:
+        outfile.write(json_object)
+    api.upload_file(
+        path_or_fileobj=f"tmp-{uid}.json",
+        path_in_repo=f"/mem-test2/{timename}.json",
+        repo_id=reponame,
+        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
+        token=token_self,
+        repo_type="dataset",
+    )
+    lines = resp.strip().strip("\n").split("\n")
+    r = requests.get(f'{save_data}mem-test2/main.json')
+    print(f'status code main:: {r.status_code}')
+    if r.status_code==200:
+
+        lod = json.loads(r.text)
+        #lod = eval(lod)
+        print (f'lod:: {lod}')
+    else:
+        lod = []
+    for i,line in enumerate(lines):
+        key_box=[]
+        print(f'LINE:: {line}')
+        if ":" in line:
+            print(f'line:: {line}')
+
+            if "keywords" in line[:16]:
+                print(f'trying:: {line}')
+                keyw=line.split(":")[1]
+                print (keyw)
+                print (keyw.split("[")[1].split("]")[0])
+                keyw=keyw.split("[")[1].split("]")[0]
+                for ea in keyw.split(","):
+                    s1=""
+                    ea=ea.strip().strip("\n")
+                    for ev in ea:
+                        if ev.isalnum():
+                            s1+=ev
+                        if ev == " ":
+                            s1+=ev
+                    #ea=s1
+                    print(s1)
+                    key_box.append(s1)
+    lod.append({"file_name":timename,"keywords":key_box})
+    json_object = json.dumps(lod, indent=4)
+    with open(f"tmp2-{uid}.json", "w") as outfile2:
+        outfile2.write(json_object)
+    api.upload_file(
+        path_or_fileobj=f"tmp2-{uid}.json",
+        path_in_repo=f"/mem-test2/main.json",
+        repo_id=reponame,
+        #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
+        token=token_self,
+        repo_type="dataset",
+    )
+
+    #return [resp]
+
+
+
+
 
 #################################
 def clear_fn():
     return "",[(None,None)]
@@ -399,6 +530,7 @@ with gr.Blocks() as app:
             prompt=gr.Textbox(label = "Instructions (optional)")
         with gr.Column(scale=1):
             report_check=gr.Checkbox(label="Return Report", value=True)
+            sum_mem_check=gr.Radio(label="Output",choices=["Summary","Memory"])
            button=gr.Button()
 
     #models_dd=gr.Dropdown(choices=[m for m in return_list],interactive=True)
@@ -421,6 +553,6 @@ with gr.Blocks() as app:
     #text=gr.JSON()
     #inp_query.change(search_models,inp_query,models_dd)
     clear_btn.click(clear_fn,None,[prompt,chatbot])
-    go=button.click(summarize,[prompt,chatbot,report_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
+    go=button.click(summarize,[prompt,chatbot,report_check,sum_mem_check,data,file,url,pdf_url,pdf_batch],[prompt,chatbot,e_box,json_out])
     stop_button.click(None,None,None,cancels=[go])
     app.queue(default_concurrency_limit=20).launch(server_port=7860,show_api=False)
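The chunking arithmetic in the new save_memory() is easy to misread: chunk is computed from the fractional divr rather than the rounded divi, so each slice comes out close to MAX_DATA characters, and divi rounds the slice count up so the final slice covers the remainder. A minimal standalone sketch of that logic, assuming MAX_DATA is the character budget defined elsewhere in app.py (it does not appear in this diff; 20000 below is a hypothetical value, and plan_chunks is a hypothetical helper name):

# Standalone sketch of the slicing arithmetic in save_memory (hypothetical helper).
MAX_DATA = 20000   # assumption: defined elsewhere in app.py; value illustrative only

def plan_chunks(c):
    # c is the delimiter count save_memory tallies over the input string
    divr = int(c) / MAX_DATA                                    # fractional chunk count
    divi = int(divr) + 1 if divr != int(divr) else int(divr)    # round up to whole chunks
    chunk = int(int(c) / divr)                                  # slice width, ~= MAX_DATA
    bounds, s, e = [], 0, chunk
    for _ in range(divi):
        bounds.append((s, e))   # save_memory feeds inp[s:e] to run_gpt on each pass
        s, e = s + chunk, e + chunk
    return bounds

print(plan_chunks(25000))   # [(0, 20000), (20000, 40000)]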
 
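Each save_memory() run uploads a timestamped record to mem-test2/ in the Omnibus/tmp dataset and appends a {"file_name", "keywords"} entry to mem-test2/main.json, so main.json doubles as a keyword index over saved memories. A minimal sketch of querying that index, assuming the layout this commit writes; find_memories is a hypothetical helper, not part of app.py:

import json
import requests

# Matches the save_data URL defined in this commit
save_data = 'https://huggingface.co/datasets/Omnibus/tmp/raw/main/'

def find_memories(term):
    # Hypothetical helper: return file_names whose keyword lists match `term`
    r = requests.get(f'{save_data}mem-test2/main.json')
    if r.status_code != 200:
        return []
    index = json.loads(r.text)   # list of {"file_name": ..., "keywords": [...]}
    return [entry["file_name"] for entry in index
            if any(term.lower() in kw.lower() for kw in entry["keywords"])]

print(find_memories("gradio"))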