Spaces:
Runtime error
Runtime error
Commit ·
d596fb5
1
Parent(s): f749736
made app.py better
Browse files
app.py
CHANGED
|
@@ -1,7 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
iface.launch()
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
import requests
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
|
| 7 |
import gradio as gr
|
| 8 |
+
from tree_sitter import Tree, Node
|
| 9 |
+
from tree_sitter_languages import get_parser
|
| 10 |
+
|
| 11 |
+
def non_whitespace_len(s: str) -> int:  # new len function
    """Return the number of characters in ``s`` that are not whitespace.

    Whitespace is whatever the regex whitespace class matches (spaces, tabs,
    newlines, ...). Those characters are stripped before counting.
    """
    # Raw string: a plain "\s" literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return len(re.sub(r"\s", "", s))
|
| 13 |
+
|
| 14 |
+
def get_line_number(index: int, source_code: str | bytes) -> int:
    """Translate an offset into ``source_code`` to a 0-based line index.

    Lines are taken from ``splitlines(keepends=True)``, so line endings count
    towards the offsets. ``source_code`` may be ``str`` (character offsets) or
    ``bytes`` (byte offsets); both provide the same ``splitlines`` API.

    Args:
        index: Offset into ``source_code``.
        source_code: The text the offset refers to.

    Returns:
        The 0-based index of the line containing ``index``. When ``index`` is
        at or past the end of ``source_code`` the total number of lines is
        returned, which is 0 for empty input.
    """
    total_chars = 0
    # Pre-bind so the fall-through return is defined even when the source has
    # no lines at all (previously an UnboundLocalError).
    line_number = 0
    for line_number, line in enumerate(source_code.splitlines(keepends=True), start=1):
        total_chars += len(line)
        if total_chars > index:
            # enumerate() counts from 1; callers expect a 0-based line index.
            return line_number - 1
    return line_number
|
| 21 |
+
|
| 22 |
+
@dataclass
class Span:
    """A ``start``/``end`` pair used to slice a string by offset or by line."""

    start: int = 0
    end: int = 0

    def __post_init__(self):
        # A span built with end=None collapses to the empty span at ``start``.
        self.end = self.start if self.end is None else self.end

    def extract(self, s: str) -> str:
        """Return ``s[start:end]``."""
        return s[self.start:self.end]

    def extract_lines(self, s: str) -> str:
        """Return lines ``start`` up to (not including) ``end`` of ``s``, newline-joined."""
        selected = s.splitlines()[self.start:self.end]
        return "\n".join(selected)

    def __add__(self, other: Span | int) -> Span:
        """Concatenate with another span, or shift both ends by an integer.

        ``Span(1, 2) + Span(2, 4)`` gives ``Span(1, 4)``. Nothing checks that
        the two spans are adjacent: ``Span(a, b) + Span(c, d)`` is always
        ``Span(a, d)``, whatever ``b`` and ``c`` are.

        Raises:
            NotImplementedError: If ``other`` is neither a ``Span`` nor an ``int``.
        """
        if isinstance(other, Span):
            return Span(self.start, other.end)
        if isinstance(other, int):
            return Span(self.start + other, self.end + other)
        raise NotImplementedError()

    def __len__(self) -> int:
        """Return ``end - start``."""
        return self.end - self.start
|
| 55 |
+
|
| 56 |
+
def chunk_tree(
    tree: Tree,
    source_code: bytes,
    MAX_CHARS=512 * 3,
    coalesce=50  # Any chunk less than 50 characters long gets coalesced with the next chunk
) -> list[Span]:
    """Split a parsed source file into line-based chunks along syntax-node boundaries.

    Args:
        tree: Parse tree whose nodes expose ``start_byte``, ``end_byte`` and
            ``children``; it must have been produced from ``source_code``.
        source_code: The UTF-8 encoded source the tree was parsed from.
        MAX_CHARS: Target maximum chunk size, measured in bytes of source.
            A child node larger than this is chunked recursively.
        coalesce: A chunk is merged with the following ones until it holds
            more than this many non-whitespace characters and spans at least
            one newline.

    Returns:
        The non-empty chunks as ``Span`` objects whose ``start``/``end`` are
        the line numbers computed by ``get_line_number``.
    """

    # 1. Recursively form chunks based on the last post (https://docs.sweep.dev/blogs/chunking-2m-files)
    def chunk_node(node: Node) -> list[Span]:
        chunks: list[Span] = []
        current_chunk: Span = Span(node.start_byte, node.start_byte)
        for child in node.children:
            child_size = child.end_byte - child.start_byte
            if child_size > MAX_CHARS:
                # Too big on its own: flush what we have and descend into it.
                chunks.append(current_chunk)
                current_chunk = Span(child.end_byte, child.end_byte)
                chunks.extend(chunk_node(child))
            elif child_size + len(current_chunk) > MAX_CHARS:
                # Would overflow the running chunk: start a new one at this child.
                chunks.append(current_chunk)
                current_chunk = Span(child.start_byte, child.end_byte)
            else:
                current_chunk += Span(child.start_byte, child.end_byte)
        chunks.append(current_chunk)
        return chunks
    chunks = chunk_node(tree.root_node)

    # 2. Filling in the gaps
    for prev, curr in zip(chunks[:-1], chunks[1:]):
        prev.end = curr.start
    # Stretch the final chunk to the end of the tree so trailing content is
    # covered. chunk_node always returns at least one span, so indexing is
    # safe; reading the loop variable here instead raised an
    # UnboundLocalError whenever the whole file fit into a single chunk.
    chunks[-1].end = tree.root_node.end_byte

    # 3. Combining small chunks with bigger ones
    new_chunks = []
    current_chunk = Span(0, 0)
    for chunk in chunks:
        current_chunk += chunk
        # Spans hold byte offsets, so slice the bytes and decode only that
        # slice; indexing the decoded string with byte offsets drifts as soon
        # as the source contains a multi-byte character.
        chunk_bytes = source_code[current_chunk.start:current_chunk.end]
        chunk_text = chunk_bytes.decode("utf-8", errors="replace")
        if non_whitespace_len(chunk_text) > coalesce and b"\n" in chunk_bytes:
            new_chunks.append(current_chunk)
            current_chunk = Span(chunk.end, chunk.end)
    if len(current_chunk) > 0:
        new_chunks.append(current_chunk)

    # 4. Changing line numbers
    line_chunks = [
        Span(
            get_line_number(chunk.start, source_code),
            get_line_number(chunk.end, source_code)
        )
        for chunk in new_chunks
    ]

    # 5. Eliminating empty chunks
    line_chunks = [chunk for chunk in line_chunks if len(chunk) > 0]

    return line_chunks
|
| 112 |
+
|
| 113 |
+
css = """
|
| 114 |
+
.code_container {
|
| 115 |
+
}
|
| 116 |
+
"""
|
| 117 |
+
|
| 118 |
+
def chunk_code(
    code: str,
    language: str,
    MAX_CHARS: int,
    coalesce: int
):
    """Chunk ``code`` and render the chunks as a single delimited string.

    The source is parsed with the parser returned by ``get_parser(language)``,
    split by ``chunk_tree``, and each resulting line span is cut out of
    ``code``. Chunks are separated by a line of ``=`` characters.

    Any exception raised along the way is caught and its message is returned
    in place of the chunked output, so the caller always receives a string.
    """
    separator = "\n\n====================\n\n"
    try:
        parser = get_parser(language)
        encoded = code.encode("utf-8")
        tree = parser.parse(encoded)
        spans = chunk_tree(tree, encoded, MAX_CHARS=MAX_CHARS, coalesce=coalesce)
        return separator.join(span.extract_lines(code) for span in spans)
    except Exception as e:
        return str(e)
|
| 132 |
+
|
| 133 |
+
# Interactive demo: edit code (or change a setting) and see it re-chunked.
with gr.Blocks(css=css) as demo:
    gr.Markdown("Start typing below and the chunked output will automatically show up.")

    default_file = "https://raw.githubusercontent.com/sweepai/sweep/b267b613d4c706eaf959fe6789f11e9a856521d1/sweepai/handlers/on_check_suite.py"
    # Fetch the sample file shown on first load. A timeout keeps start-up from
    # hanging on a stalled connection, and any request failure falls back to
    # an empty editor instead of crashing the app.
    try:
        response = requests.get(default_file, timeout=10)
        response.raise_for_status()
        default_code = response.text
    except requests.RequestException:
        default_code = ""

    with gr.Row():
        language = gr.Dropdown(["python", "javascript", "go", "ruby", "java", "php", "c", "cpp", "rust", "haskell"], label="Language", value="python")
        max_chars = gr.Slider(100, 3000, 1500, label="Max Characters")
        coalesce = gr.Slider(0, 300, 100, label="Coalesce")
    with gr.Row():
        inp = gr.Code(placeholder="Enter the code here", label="Code to Chunk", language=language.value, lines=60, elem_classes="code_container", value=default_code)
        out = gr.Code(label="Chunked Code", language=language.value, lines=60, value=chunk_code(default_code, language.value, max_chars.value, coalesce.value))

    def update_language(inp, language, max_chars, coalesce):
        """Switch both editors to ``language`` and re-chunk the current input.

        The arguments are the current values of the components listed in
        ``inputs`` below, so ``inp`` is already the editor's text.
        """
        return (
            gr.update(language=language),
            # Pass the text itself; it has no ``.value`` attribute.
            gr.update(language=language, value=chunk_code(inp, language, max_chars, coalesce))
        )

    # Re-chunk whenever the code or any setting changes.
    language.change(fn=update_language, inputs=[inp, language, max_chars, coalesce], outputs=[inp, out])
    max_chars.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
    coalesce.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)
    inp.change(fn=chunk_code, inputs=[inp, language, max_chars, coalesce], outputs=out)

demo.launch()
|
|
|